From 45bb48ea197fe496865387120c7c55b56f0717d6 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Sat, 13 Jun 2015 03:28:10 +0000
Subject: R600 -> AMDGPU rename

llvm-svn: 239657
---
 .../CodeGen/AMDGPU/32-bit-local-address-space.ll | 139 ++
 llvm/test/CodeGen/AMDGPU/README | 21 +
 llvm/test/CodeGen/AMDGPU/add-debug.ll | 24 +
 llvm/test/CodeGen/AMDGPU/add.ll | 192 +++
 llvm/test/CodeGen/AMDGPU/add_i64.ll | 84 ++
 llvm/test/CodeGen/AMDGPU/address-space.ll | 36 +
 llvm/test/CodeGen/AMDGPU/and.ll | 296 ++++
 llvm/test/CodeGen/AMDGPU/anyext.ll | 15 +
 llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll | 44 +
 llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll | 17 +
 llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 92 ++
 llvm/test/CodeGen/AMDGPU/atomic_load_add.ll | 39 +
 llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll | 39 +
 llvm/test/CodeGen/AMDGPU/basic-branch.ll | 16 +
 llvm/test/CodeGen/AMDGPU/basic-loop.ll | 18 +
 llvm/test/CodeGen/AMDGPU/bfe_uint.ll | 26 +
 llvm/test/CodeGen/AMDGPU/bfi_int.ll | 53 +
 llvm/test/CodeGen/AMDGPU/big_alu.ll | 1173 +++++++++++++++
 llvm/test/CodeGen/AMDGPU/bitcast.ll | 79 +
 llvm/test/CodeGen/AMDGPU/bswap.ll | 115 ++
 llvm/test/CodeGen/AMDGPU/build_vector.ll | 35 +
 llvm/test/CodeGen/AMDGPU/call.ll | 33 +
 llvm/test/CodeGen/AMDGPU/call_fs.ll | 17 +
 llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll | 32 +
 llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll | 244 +++
 llvm/test/CodeGen/AMDGPU/cf_end.ll | 9 +
 llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 242 +++
 llvm/test/CodeGen/AMDGPU/coalescer_remat.ll | 57 +
 .../AMDGPU/codegen-prepare-addrmode-sext.ll | 18 +
 llvm/test/CodeGen/AMDGPU/combine_vloads.ll | 42 +
 llvm/test/CodeGen/AMDGPU/commute-compares.ll | 697 +++++++++
 llvm/test/CodeGen/AMDGPU/commute_modifiers.ll | 181 +++
 llvm/test/CodeGen/AMDGPU/complex-folding.ll | 19 +
 llvm/test/CodeGen/AMDGPU/concat_vectors.ll | 296 ++++
 llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 167 +++
 llvm/test/CodeGen/AMDGPU/copy-to-reg.ll | 27 +
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 71 +
 llvm/test/CodeGen/AMDGPU/ctpop.ll | 300 ++++
 llvm/test/CodeGen/AMDGPU/ctpop64.ll | 124 ++
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 71 +
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 196 +++
 llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll | 86 ++
 llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll | 83 ++
 .../dagcombiner-bug-illegal-vec4-int-to-fp.ll | 36 +
 llvm/test/CodeGen/AMDGPU/debug.ll | 10 +
 llvm/test/CodeGen/AMDGPU/default-fp-mode.ll | 36 +
 .../AMDGPU/disconnected-predset-break-bug.ll | 29 +
 llvm/test/CodeGen/AMDGPU/dot4-folding.ll | 27 +
 .../ds-negative-offset-addressing-mode-loop.ll | 69 +
 llvm/test/CodeGen/AMDGPU/ds_read2.ll | 515 +++++++
 llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll | 45 +
 llvm/test/CodeGen/AMDGPU/ds_read2st64.ll | 272 ++++
 llvm/test/CodeGen/AMDGPU/ds_write2.ll | 425 ++++++
 llvm/test/CodeGen/AMDGPU/ds_write2st64.ll | 119 ++
 llvm/test/CodeGen/AMDGPU/elf.ll | 34 +
 llvm/test/CodeGen/AMDGPU/elf.r600.ll | 17 +
 llvm/test/CodeGen/AMDGPU/empty-function.ll | 21 +
 llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll | 34 +
 llvm/test/CodeGen/AMDGPU/extload-private.ll | 46 +
 llvm/test/CodeGen/AMDGPU/extload.ll | 53 +
 llvm/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll | 30 +
 llvm/test/CodeGen/AMDGPU/fabs.f64.ll | 97 ++
 llvm/test/CodeGen/AMDGPU/fabs.ll | 101 ++
 llvm/test/CodeGen/AMDGPU/fadd.ll | 64 +
 llvm/test/CodeGen/AMDGPU/fadd64.ll | 14 +
 llvm/test/CodeGen/AMDGPU/fceil.ll | 132 ++
 llvm/test/CodeGen/AMDGPU/fceil64.ll | 105 ++
 llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll | 14 +
 llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll | 16 +
 llvm/test/CodeGen/AMDGPU/fcmp.ll | 38 +
 llvm/test/CodeGen/AMDGPU/fcmp64.ll | 74 +
 llvm/test/CodeGen/AMDGPU/fconst64.ll | 13 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 53 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 40 +
 llvm/test/CodeGen/AMDGPU/fdiv.f64.ll | 96 ++
 llvm/test/CodeGen/AMDGPU/fdiv.ll | 68 +
 llvm/test/CodeGen/AMDGPU/fetch-limits.r600.ll | 48 +
 llvm/test/CodeGen/AMDGPU/fetch-limits.r700+.ll | 81 +
 llvm/test/CodeGen/AMDGPU/ffloor.f64.ll | 127 ++
 llvm/test/CodeGen/AMDGPU/ffloor.ll | 49 +
 llvm/test/CodeGen/AMDGPU/flat-address-space.ll | 184 +++
 llvm/test/CodeGen/AMDGPU/floor.ll | 15 +
 llvm/test/CodeGen/AMDGPU/fma-combine.ll | 368 +++++
 llvm/test/CodeGen/AMDGPU/fma.f64.ll | 47 +
 llvm/test/CodeGen/AMDGPU/fma.ll | 92 ++
 llvm/test/CodeGen/AMDGPU/fmad.ll | 19 +
 llvm/test/CodeGen/AMDGPU/fmax.ll | 17 +
 llvm/test/CodeGen/AMDGPU/fmax3.f64.ll | 24 +
 llvm/test/CodeGen/AMDGPU/fmax3.ll | 39 +
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 67 +
 llvm/test/CodeGen/AMDGPU/fmax_legacy.ll | 116 ++
 llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll | 76 +
 llvm/test/CodeGen/AMDGPU/fmaxnum.ll | 283 ++++
 llvm/test/CodeGen/AMDGPU/fmin.ll | 17 +
 llvm/test/CodeGen/AMDGPU/fmin3.ll | 40 +
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 77 +
 llvm/test/CodeGen/AMDGPU/fmin_legacy.ll | 123 ++
 llvm/test/CodeGen/AMDGPU/fminnum.f64.ll | 76 +
 llvm/test/CodeGen/AMDGPU/fminnum.ll | 281 ++++
 llvm/test/CodeGen/AMDGPU/fmul.ll | 92 ++
 llvm/test/CodeGen/AMDGPU/fmul64.ll | 39 +
 llvm/test/CodeGen/AMDGPU/fmuladd.ll | 199 +++
 llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 58 +
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 100 ++
 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 118 ++
 llvm/test/CodeGen/AMDGPU/fneg.f64.ll | 60 +
 llvm/test/CodeGen/AMDGPU/fneg.ll | 70 +
 llvm/test/CodeGen/AMDGPU/fp-classify.ll | 131 ++
 llvm/test/CodeGen/AMDGPU/fp16_to_fp.ll | 29 +
 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 15 +
 llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll | 56 +
 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 230 +++
 llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll | 70 +
 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 217 +++
 llvm/test/CodeGen/AMDGPU/fpext.ll | 45 +
 llvm/test/CodeGen/AMDGPU/fptrunc.ll | 45 +
 llvm/test/CodeGen/AMDGPU/frem.ll | 112 ++
 llvm/test/CodeGen/AMDGPU/fsqrt.ll | 29 +
 llvm/test/CodeGen/AMDGPU/fsub.ll | 75 +
 llvm/test/CodeGen/AMDGPU/fsub64.ll | 107 ++
 llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll | 111 ++
 llvm/test/CodeGen/AMDGPU/ftrunc.ll | 120 ++
 llvm/test/CodeGen/AMDGPU/gep-address-space.ll | 55 +
 llvm/test/CodeGen/AMDGPU/global-directive.ll | 15 +
 llvm/test/CodeGen/AMDGPU/global-extload-i1.ll | 302 ++++
 llvm/test/CodeGen/AMDGPU/global-extload-i16.ll | 302 ++++
 llvm/test/CodeGen/AMDGPU/global-extload-i32.ll | 457 ++++++
 llvm/test/CodeGen/AMDGPU/global-extload-i8.ll | 299 ++++
 .../test/CodeGen/AMDGPU/global-zero-initializer.ll | 13 +
 llvm/test/CodeGen/AMDGPU/global_atomics.ll | 801 ++++++++++
 .../test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll | 57 +
 llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll | 101 ++
 llvm/test/CodeGen/AMDGPU/half.ll | 525 +++++++
 llvm/test/CodeGen/AMDGPU/hsa.ll | 14 +
 llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll | 22 +
 llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll | 30 +
 llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll | 11 +
 .../AMDGPU/icmp-select-sete-reverse-args.ll | 18 +
 llvm/test/CodeGen/AMDGPU/icmp64.ll | 93 ++
 llvm/test/CodeGen/AMDGPU/imm.ll | 617 ++++++++
 llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll | 121 ++
 llvm/test/CodeGen/AMDGPU/indirect-private-64.ll | 91 ++
 .../test/CodeGen/AMDGPU/infinite-loop-evergreen.ll | 10 +
 llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 18 +
 llvm/test/CodeGen/AMDGPU/inline-asm.ll | 12 +
 llvm/test/CodeGen/AMDGPU/inline-calls.ll | 25 +
 llvm/test/CodeGen/AMDGPU/input-mods.ll | 26 +
 llvm/test/CodeGen/AMDGPU/insert_subreg.ll | 16 +
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 252 ++++
 llvm/test/CodeGen/AMDGPU/jump-address.ll | 52 +
 llvm/test/CodeGen/AMDGPU/kcache-fold.ll | 100 ++
 llvm/test/CodeGen/AMDGPU/kernel-args.ll | 473 ++++++
 llvm/test/CodeGen/AMDGPU/large-alloca.ll | 15 +
 .../CodeGen/AMDGPU/large-constant-initializer.ll | 19 +
 llvm/test/CodeGen/AMDGPU/lds-initializer.ll | 13 +
 llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll | 28 +
 llvm/test/CodeGen/AMDGPU/lds-output-queue.ll | 99 ++
 llvm/test/CodeGen/AMDGPU/lds-size.ll | 26 +
 llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll | 13 +
 .../CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll | 26 +
 llvm/test/CodeGen/AMDGPU/lit.local.cfg | 2 +
 llvm/test/CodeGen/AMDGPU/literals.ll | 64 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll | 49 +
 .../CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll | 30 +
 .../CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll | 31 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll | 437 ++++++
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll | 627 ++++++++
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll | 42 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll | 60 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll | 28 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll | 67 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll | 497 +++++++
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll | 59 +
 .../CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll | 43 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll | 31 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll | 179 +++
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll | 364 +++++
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll | 28 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll | 60 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll | 65 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll | 22 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll | 33 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll | 33 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll | 16 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll | 39 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll | 23 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll | 13 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll | 17 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll | 33 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll | 50 +
 .../CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll | 23 +
 .../test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll | 23 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll | 33 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll | 42 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll | 30 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll | 17 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll | 38 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll | 48 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll | 48 +
 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll | 18 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll | 59 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll | 509 +++++++
 llvm/test/CodeGen/AMDGPU/llvm.SI.getlod.ll | 45 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.image.ll | 50 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll | 310 ++++
 llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll | 310 ++++
 llvm/test/CodeGen/AMDGPU/llvm.SI.imageload.ll | 132 ++
 llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll | 53 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll | 111 ++
 llvm/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll | 96 ++
 llvm/test/CodeGen/AMDGPU/llvm.SI.sample.ll | 160 ++
 llvm/test/CodeGen/AMDGPU/llvm.SI.sampled.ll | 143 ++
 llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll | 20 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll | 24 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll | 47 +
 llvm/test/CodeGen/AMDGPU/llvm.SI.tid.ll | 18 +
 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll | 11 +
 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll | 21 +
 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll | 13 +
 llvm/test/CodeGen/AMDGPU/llvm.cos.ll | 41 +
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 80 +
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 80 +
 llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll | 365 +++++
 llvm/test/CodeGen/AMDGPU/llvm.pow.ll | 40 +
 llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll | 46 +
 llvm/test/CodeGen/AMDGPU/llvm.rint.ll | 62 +
 llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 74 +
 llvm/test/CodeGen/AMDGPU/llvm.round.ll | 67 +
 llvm/test/CodeGen/AMDGPU/llvm.sin.ll | 92 ++
 llvm/test/CodeGen/AMDGPU/llvm.sqrt.ll | 105 ++
 llvm/test/CodeGen/AMDGPU/load-i1.ll | 149 ++
 llvm/test/CodeGen/AMDGPU/load-input-fold.ll | 117 ++
 llvm/test/CodeGen/AMDGPU/load.ll | 709 +++++++++
 llvm/test/CodeGen/AMDGPU/load.vec.ll | 25 +
 llvm/test/CodeGen/AMDGPU/load64.ll | 31 +
 llvm/test/CodeGen/AMDGPU/local-64.ll | 167 +++
 llvm/test/CodeGen/AMDGPU/local-atomics.ll | 551 +++++++
 llvm/test/CodeGen/AMDGPU/local-atomics64.ll | 470 ++++++
 .../CodeGen/AMDGPU/local-memory-two-objects.ll | 63 +
 llvm/test/CodeGen/AMDGPU/local-memory.ll | 49 +
 llvm/test/CodeGen/AMDGPU/loop-address.ll | 34 +
 llvm/test/CodeGen/AMDGPU/loop-idiom.ll | 51 +
 llvm/test/CodeGen/AMDGPU/lshl.ll | 15 +
 llvm/test/CodeGen/AMDGPU/lshr.ll | 15 +
 llvm/test/CodeGen/AMDGPU/m0-spill.ll | 35 +
 llvm/test/CodeGen/AMDGPU/mad-combine.ll | 567 +++++++
 llvm/test/CodeGen/AMDGPU/mad-sub.ll | 215 +++
 llvm/test/CodeGen/AMDGPU/mad_int24.ll | 35 +
 llvm/test/CodeGen/AMDGPU/mad_uint24.ll | 76 +
 llvm/test/CodeGen/AMDGPU/madak.ll | 193 +++
 llvm/test/CodeGen/AMDGPU/madmk.ll | 205 +++
 llvm/test/CodeGen/AMDGPU/max-literals.ll | 67 +
 llvm/test/CodeGen/AMDGPU/max.ll | 168 +++
 llvm/test/CodeGen/AMDGPU/max3.ll | 41 +
 llvm/test/CodeGen/AMDGPU/merge-stores.ll | 536 +++++++
 llvm/test/CodeGen/AMDGPU/min.ll | 189 +++
 llvm/test/CodeGen/AMDGPU/min3.ll | 111 ++
 llvm/test/CodeGen/AMDGPU/missing-store.ll | 26 +
 llvm/test/CodeGen/AMDGPU/mubuf.ll | 183 +++
 llvm/test/CodeGen/AMDGPU/mul.ll | 200 +++
 llvm/test/CodeGen/AMDGPU/mul_int24.ll | 23 +
 llvm/test/CodeGen/AMDGPU/mul_uint24.ll | 67 +
 llvm/test/CodeGen/AMDGPU/mulhu.ll | 17 +
 .../AMDGPU/no-initializer-constant-addrspace.ll | 21 +
 llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll | 191 +++
 llvm/test/CodeGen/AMDGPU/operand-folding.ll | 113 ++
 llvm/test/CodeGen/AMDGPU/operand-spacing.ll | 18 +
 llvm/test/CodeGen/AMDGPU/or.ll | 178 +++
 llvm/test/CodeGen/AMDGPU/packetizer.ll | 34 +
 llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll | 59 +
 llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll | 66 +
 llvm/test/CodeGen/AMDGPU/predicate-dp4.ll | 27 +
 llvm/test/CodeGen/AMDGPU/predicates.ll | 104 ++
 llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll | 32 +
 llvm/test/CodeGen/AMDGPU/private-memory-broken.ll | 21 +
 llvm/test/CodeGen/AMDGPU/private-memory.ll | 313 ++++
 llvm/test/CodeGen/AMDGPU/pv-packing.ll | 45 +
 llvm/test/CodeGen/AMDGPU/pv.ll | 241 +++
 llvm/test/CodeGen/AMDGPU/r600-encoding.ll | 25 +
 llvm/test/CodeGen/AMDGPU/r600-export-fix.ll | 142 ++
 ...-infinite-loop-bug-while-reorganizing-vector.ll | 58 +
 llvm/test/CodeGen/AMDGPU/r600cfg.ll | 119 ++
 llvm/test/CodeGen/AMDGPU/reciprocal.ll | 15 +
 .../test/CodeGen/AMDGPU/register-count-comments.ll | 27 +
 llvm/test/CodeGen/AMDGPU/reorder-stores.ll | 105 ++
 llvm/test/CodeGen/AMDGPU/rotl.i64.ll | 39 +
 llvm/test/CodeGen/AMDGPU/rotl.ll | 57 +
 llvm/test/CodeGen/AMDGPU/rotr.i64.ll | 61 +
 llvm/test/CodeGen/AMDGPU/rotr.ll | 53 +
 llvm/test/CodeGen/AMDGPU/rsq.ll | 74 +
 llvm/test/CodeGen/AMDGPU/rv7x0_count3.ll | 41 +
 llvm/test/CodeGen/AMDGPU/s_movk_i32.ll | 185 +++
 llvm/test/CodeGen/AMDGPU/saddo.ll | 63 +
 llvm/test/CodeGen/AMDGPU/salu-to-valu.ll | 118 ++
 llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 81 +
 .../CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll | 82 +
 .../test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll | 88 ++
 llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll | 55 +
 llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll | 41 +
 llvm/test/CodeGen/AMDGPU/schedule-if-2.ll | 94 ++
 llvm/test/CodeGen/AMDGPU/schedule-if.ll | 46 +
 .../CodeGen/AMDGPU/schedule-kernel-arg-loads.ll | 51 +
 .../AMDGPU/schedule-vs-if-nested-loop-failure.ll | 163 ++
 .../CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll | 132 ++
 llvm/test/CodeGen/AMDGPU/scratch-buffer.ll | 87 ++
 llvm/test/CodeGen/AMDGPU/sdiv.ll | 104 ++
 llvm/test/CodeGen/AMDGPU/sdivrem24.ll | 239 +++
 llvm/test/CodeGen/AMDGPU/sdivrem64.ll | 225 +++
 llvm/test/CodeGen/AMDGPU/select-i1.ll | 15 +
 llvm/test/CodeGen/AMDGPU/select-vectors.ll | 156 ++
 llvm/test/CodeGen/AMDGPU/select.ll | 47 +
 llvm/test/CodeGen/AMDGPU/select64.ll | 68 +
 llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll | 12 +
 llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll | 12 +
 .../CodeGen/AMDGPU/selectcc-icmp-select-float.ll | 16 +
 llvm/test/CodeGen/AMDGPU/selectcc-opt.ll | 80 +
 llvm/test/CodeGen/AMDGPU/selectcc.ll | 20 +
 llvm/test/CodeGen/AMDGPU/set-dx10.ll | 161 ++
 llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll | 30 +
 llvm/test/CodeGen/AMDGPU/setcc-opt.ll | 236 +++
 llvm/test/CodeGen/AMDGPU/setcc.ll | 377 +++++
 llvm/test/CodeGen/AMDGPU/setcc64.ll | 259 ++++
 llvm/test/CodeGen/AMDGPU/seto.ll | 15 +
 llvm/test/CodeGen/AMDGPU/setuo.ll | 15 +
 llvm/test/CodeGen/AMDGPU/sext-eliminate.ll | 26 +
 llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 611 ++++++++
 llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll | 105 ++
 .../CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll | 19 +
 llvm/test/CodeGen/AMDGPU/sgpr-copy.ll | 379 +++++
 llvm/test/CodeGen/AMDGPU/shared-op-cycle.ll | 32 +
 llvm/test/CodeGen/AMDGPU/shl.ll | 180 +++
 llvm/test/CodeGen/AMDGPU/shl_add_constant.ll | 90 ++
 llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll | 284 ++++
 .../CodeGen/AMDGPU/si-annotate-cf-assertion.ll | 25 +
 llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 63 +
 llvm/test/CodeGen/AMDGPU/si-lod-bias.ll | 52 +
 llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll | 1568 ++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/si-spill-cf.ll | 501 +++++++
 .../CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 236 +++
 llvm/test/CodeGen/AMDGPU/si-vector-hang.ll | 105 ++
 llvm/test/CodeGen/AMDGPU/sign_extend.ll | 63 +
 .../AMDGPU/simplify-demanded-bits-build-pair.ll | 39 +
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 61 +
 llvm/test/CodeGen/AMDGPU/sint_to_fp.ll | 64 +
 llvm/test/CodeGen/AMDGPU/smrd.ll | 111 ++
 llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 48 +
 llvm/test/CodeGen/AMDGPU/sra.ll | 213 +++
 llvm/test/CodeGen/AMDGPU/srem.ll | 112 ++
 llvm/test/CodeGen/AMDGPU/srl.ll | 186 +++
 llvm/test/CodeGen/AMDGPU/ssubo.ll | 65 +
 llvm/test/CodeGen/AMDGPU/store-barrier.ll | 42 +
 llvm/test/CodeGen/AMDGPU/store-v3i32.ll | 13 +
 llvm/test/CodeGen/AMDGPU/store-v3i64.ll | 29 +
 llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll | 12 +
 llvm/test/CodeGen/AMDGPU/store.ll | 369 +++++
 llvm/test/CodeGen/AMDGPU/store.r600.ll | 22 +
 llvm/test/CodeGen/AMDGPU/structurize.ll | 83 ++
 llvm/test/CodeGen/AMDGPU/structurize1.ll | 62 +
 llvm/test/CodeGen/AMDGPU/sub.ll | 130 ++
 llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll | 109 ++
 llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll | 19 +
 llvm/test/CodeGen/AMDGPU/swizzle-export.ll | 129 ++
 llvm/test/CodeGen/AMDGPU/tex-clause-antidep.ll | 25 +
 llvm/test/CodeGen/AMDGPU/texture-input-merge.ll | 31 +
 llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll | 170 +++
 llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll | 56 +
 llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll | 33 +
 .../AMDGPU/trunc-vector-store-assertion-failure.ll | 20 +
 llvm/test/CodeGen/AMDGPU/trunc.ll | 100 ++
 llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll | 58 +
 llvm/test/CodeGen/AMDGPU/uaddo.ll | 85 ++
 llvm/test/CodeGen/AMDGPU/udiv.ll | 48 +
 llvm/test/CodeGen/AMDGPU/udivrem.ll | 345 +++++
 llvm/test/CodeGen/AMDGPU/udivrem24.ll | 245 +++
 llvm/test/CodeGen/AMDGPU/udivrem64.ll | 223 +++
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 98 ++
 llvm/test/CodeGen/AMDGPU/uint_to_fp.ll | 82 +
 llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll | 254 ++++
 .../AMDGPU/unhandled-loop-condition-assertion.ll | 115 ++
 llvm/test/CodeGen/AMDGPU/unroll.ll | 36 +
 llvm/test/CodeGen/AMDGPU/unsupported-cc.ll | 125 ++
 llvm/test/CodeGen/AMDGPU/urecip.ll | 13 +
 llvm/test/CodeGen/AMDGPU/urem.ll | 94 ++
 .../test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll | 103 ++
 llvm/test/CodeGen/AMDGPU/usubo.ll | 86 ++
 llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll | 17 +
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 39 +
 llvm/test/CodeGen/AMDGPU/valu-i1.ll | 188 +++
 llvm/test/CodeGen/AMDGPU/vector-alloca.ll | 77 +
 llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll | 25 +
 llvm/test/CodeGen/AMDGPU/vop-shrink.ll | 51 +
 llvm/test/CodeGen/AMDGPU/vselect.ll | 77 +
 llvm/test/CodeGen/AMDGPU/vselect64.ll | 15 +
 llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll | 29 +
 llvm/test/CodeGen/AMDGPU/vtx-schedule.ll | 18 +
 llvm/test/CodeGen/AMDGPU/wait.ll | 45 +
 llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll | 238 +++
 llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll | 81 +
 llvm/test/CodeGen/AMDGPU/xor.ll | 173 +++
 llvm/test/CodeGen/AMDGPU/zero_extend.ll | 41 +
 .../CodeGen/R600/32-bit-local-address-space.ll | 139 --
 llvm/test/CodeGen/R600/README | 21 -
 llvm/test/CodeGen/R600/add-debug.ll | 24 -
 llvm/test/CodeGen/R600/add.ll | 192 ---
 llvm/test/CodeGen/R600/add_i64.ll | 84 --
 llvm/test/CodeGen/R600/address-space.ll | 36 -
 llvm/test/CodeGen/R600/and.ll | 296 ----
 llvm/test/CodeGen/R600/anyext.ll | 15 -
 llvm/test/CodeGen/R600/array-ptr-calc-i32.ll | 44 -
 llvm/test/CodeGen/R600/array-ptr-calc-i64.ll | 17 -
 llvm/test/CodeGen/R600/atomic_cmp_swap_local.ll | 92 --
 llvm/test/CodeGen/R600/atomic_load_add.ll | 39 -
 llvm/test/CodeGen/R600/atomic_load_sub.ll | 39 -
 llvm/test/CodeGen/R600/basic-branch.ll | 16 -
 llvm/test/CodeGen/R600/basic-loop.ll | 18 -
 llvm/test/CodeGen/R600/bfe_uint.ll | 26 -
 llvm/test/CodeGen/R600/bfi_int.ll | 53 -
 llvm/test/CodeGen/R600/big_alu.ll | 1173 ---------------
 llvm/test/CodeGen/R600/bitcast.ll | 79 -
 llvm/test/CodeGen/R600/bswap.ll | 115 --
 llvm/test/CodeGen/R600/build_vector.ll | 35 -
 llvm/test/CodeGen/R600/call.ll | 33 -
 llvm/test/CodeGen/R600/call_fs.ll | 17 -
 llvm/test/CodeGen/R600/cayman-loop-bug.ll | 32 -
 llvm/test/CodeGen/R600/cf-stack-bug.ll | 244 ---
 llvm/test/CodeGen/R600/cf_end.ll | 9 -
 llvm/test/CodeGen/R600/cgp-addressing-modes.ll | 242 ---
 llvm/test/CodeGen/R600/coalescer_remat.ll | 57 -
 .../CodeGen/R600/codegen-prepare-addrmode-sext.ll | 18 -
 llvm/test/CodeGen/R600/combine_vloads.ll | 42 -
 llvm/test/CodeGen/R600/commute-compares.ll | 697 ---------
 llvm/test/CodeGen/R600/commute_modifiers.ll | 181 ---
 llvm/test/CodeGen/R600/complex-folding.ll | 19 -
 llvm/test/CodeGen/R600/concat_vectors.ll | 296 ----
 llvm/test/CodeGen/R600/copy-illegal-type.ll | 167 ---
 llvm/test/CodeGen/R600/copy-to-reg.ll | 27 -
 llvm/test/CodeGen/R600/ctlz_zero_undef.ll | 71 -
 llvm/test/CodeGen/R600/ctpop.ll | 300 ----
 llvm/test/CodeGen/R600/ctpop64.ll | 124 --
 llvm/test/CodeGen/R600/cttz_zero_undef.ll | 71 -
 llvm/test/CodeGen/R600/cvt_f32_ubyte.ll | 196 ---
 llvm/test/CodeGen/R600/cvt_flr_i32_f32.ll | 86 --
 llvm/test/CodeGen/R600/cvt_rpi_i32_f32.ll | 83 --
 .../R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll | 36 -
 llvm/test/CodeGen/R600/debug.ll | 10 -
 llvm/test/CodeGen/R600/default-fp-mode.ll | 36 -
 .../CodeGen/R600/disconnected-predset-break-bug.ll | 29 -
 llvm/test/CodeGen/R600/dot4-folding.ll | 27 -
 .../ds-negative-offset-addressing-mode-loop.ll | 69 -
 llvm/test/CodeGen/R600/ds_read2.ll | 515 -------
 llvm/test/CodeGen/R600/ds_read2_offset_order.ll | 45 -
 llvm/test/CodeGen/R600/ds_read2st64.ll | 272 ----
 llvm/test/CodeGen/R600/ds_write2.ll | 425 ------
 llvm/test/CodeGen/R600/ds_write2st64.ll | 119 --
 llvm/test/CodeGen/R600/elf.ll | 34 -
 llvm/test/CodeGen/R600/elf.r600.ll | 17 -
 llvm/test/CodeGen/R600/empty-function.ll | 21 -
 llvm/test/CodeGen/R600/endcf-loop-header.ll | 34 -
 llvm/test/CodeGen/R600/extload-private.ll | 46 -
 llvm/test/CodeGen/R600/extload.ll | 53 -
 llvm/test/CodeGen/R600/extract_vector_elt_i16.ll | 30 -
 llvm/test/CodeGen/R600/fabs.f64.ll | 97 --
 llvm/test/CodeGen/R600/fabs.ll | 101 --
 llvm/test/CodeGen/R600/fadd.ll | 64 -
 llvm/test/CodeGen/R600/fadd64.ll | 14 -
 llvm/test/CodeGen/R600/fceil.ll | 132 --
 llvm/test/CodeGen/R600/fceil64.ll | 105 --
 llvm/test/CodeGen/R600/fcmp-cnd.ll | 14 -
 llvm/test/CodeGen/R600/fcmp-cnde-int-args.ll | 16 -
 llvm/test/CodeGen/R600/fcmp.ll | 38 -
 llvm/test/CodeGen/R600/fcmp64.ll | 74 -
 llvm/test/CodeGen/R600/fconst64.ll | 13 -
 llvm/test/CodeGen/R600/fcopysign.f32.ll | 53 -
 llvm/test/CodeGen/R600/fcopysign.f64.ll | 40 -
 llvm/test/CodeGen/R600/fdiv.f64.ll | 96 --
 llvm/test/CodeGen/R600/fdiv.ll | 68 -
 llvm/test/CodeGen/R600/fetch-limits.r600.ll | 48 -
 llvm/test/CodeGen/R600/fetch-limits.r700+.ll | 81 -
 llvm/test/CodeGen/R600/ffloor.f64.ll | 127 --
 llvm/test/CodeGen/R600/ffloor.ll | 49 -
 llvm/test/CodeGen/R600/flat-address-space.ll | 184 ---
 llvm/test/CodeGen/R600/floor.ll | 15 -
 llvm/test/CodeGen/R600/fma-combine.ll | 368 -----
 llvm/test/CodeGen/R600/fma.f64.ll | 47 -
 llvm/test/CodeGen/R600/fma.ll | 92 --
 llvm/test/CodeGen/R600/fmad.ll | 19 -
 llvm/test/CodeGen/R600/fmax.ll | 17 -
 llvm/test/CodeGen/R600/fmax3.f64.ll | 24 -
 llvm/test/CodeGen/R600/fmax3.ll | 39 -
 llvm/test/CodeGen/R600/fmax_legacy.f64.ll | 67 -
 llvm/test/CodeGen/R600/fmax_legacy.ll | 116 --
 llvm/test/CodeGen/R600/fmaxnum.f64.ll | 76 -
 llvm/test/CodeGen/R600/fmaxnum.ll | 283 ----
 llvm/test/CodeGen/R600/fmin.ll | 17 -
 llvm/test/CodeGen/R600/fmin3.ll | 40 -
 llvm/test/CodeGen/R600/fmin_legacy.f64.ll | 77 -
 llvm/test/CodeGen/R600/fmin_legacy.ll | 123 --
 llvm/test/CodeGen/R600/fminnum.f64.ll | 76 -
 llvm/test/CodeGen/R600/fminnum.ll | 281 ----
 llvm/test/CodeGen/R600/fmul.ll | 92 --
 llvm/test/CodeGen/R600/fmul64.ll | 39 -
 llvm/test/CodeGen/R600/fmuladd.ll | 199 ---
 llvm/test/CodeGen/R600/fnearbyint.ll | 58 -
 llvm/test/CodeGen/R600/fneg-fabs.f64.ll | 100 --
 llvm/test/CodeGen/R600/fneg-fabs.ll | 118 --
 llvm/test/CodeGen/R600/fneg.f64.ll | 60 -
 llvm/test/CodeGen/R600/fneg.ll | 70 -
 llvm/test/CodeGen/R600/fp-classify.ll | 131 --
 llvm/test/CodeGen/R600/fp16_to_fp.ll | 29 -
 llvm/test/CodeGen/R600/fp32_to_fp16.ll | 15 -
 llvm/test/CodeGen/R600/fp_to_sint.f64.ll | 56 -
 llvm/test/CodeGen/R600/fp_to_sint.ll | 230 ---
 llvm/test/CodeGen/R600/fp_to_uint.f64.ll | 70 -
 llvm/test/CodeGen/R600/fp_to_uint.ll | 217 ---
 llvm/test/CodeGen/R600/fpext.ll | 45 -
 llvm/test/CodeGen/R600/fptrunc.ll | 45 -
 llvm/test/CodeGen/R600/frem.ll | 112 --
 llvm/test/CodeGen/R600/fsqrt.ll | 29 -
 llvm/test/CodeGen/R600/fsub.ll | 75 -
 llvm/test/CodeGen/R600/fsub64.ll | 107 --
 llvm/test/CodeGen/R600/ftrunc.f64.ll | 111 --
 llvm/test/CodeGen/R600/ftrunc.ll | 120 --
 llvm/test/CodeGen/R600/gep-address-space.ll | 55 -
 llvm/test/CodeGen/R600/global-directive.ll | 15 -
 llvm/test/CodeGen/R600/global-extload-i1.ll | 302 ----
 llvm/test/CodeGen/R600/global-extload-i16.ll | 302 ----
 llvm/test/CodeGen/R600/global-extload-i32.ll | 457 ------
 llvm/test/CodeGen/R600/global-extload-i8.ll | 299 ----
 llvm/test/CodeGen/R600/global-zero-initializer.ll | 13 -
 llvm/test/CodeGen/R600/global_atomics.ll | 801 ----------
 llvm/test/CodeGen/R600/gv-const-addrspace-fail.ll | 57 -
 llvm/test/CodeGen/R600/gv-const-addrspace.ll | 101 --
 llvm/test/CodeGen/R600/half.ll | 525 -------
 llvm/test/CodeGen/R600/hsa.ll | 14 -
 llvm/test/CodeGen/R600/i1-copy-implicit-def.ll | 22 -
 llvm/test/CodeGen/R600/i1-copy-phi.ll | 30 -
 llvm/test/CodeGen/R600/i8-to-double-to-float.ll | 11 -
 .../CodeGen/R600/icmp-select-sete-reverse-args.ll | 18 -
 llvm/test/CodeGen/R600/icmp64.ll | 93 --
 llvm/test/CodeGen/R600/imm.ll | 617 --------
 llvm/test/CodeGen/R600/indirect-addressing-si.ll | 121 --
 llvm/test/CodeGen/R600/indirect-private-64.ll | 91 --
 llvm/test/CodeGen/R600/infinite-loop-evergreen.ll | 10 -
 llvm/test/CodeGen/R600/infinite-loop.ll | 18 -
 llvm/test/CodeGen/R600/inline-asm.ll | 12 -
 llvm/test/CodeGen/R600/inline-calls.ll | 25 -
 llvm/test/CodeGen/R600/input-mods.ll | 26 -
 llvm/test/CodeGen/R600/insert_subreg.ll | 16 -
 llvm/test/CodeGen/R600/insert_vector_elt.ll | 252 ----
 llvm/test/CodeGen/R600/jump-address.ll | 52 -
 llvm/test/CodeGen/R600/kcache-fold.ll | 100 --
 llvm/test/CodeGen/R600/kernel-args.ll | 473 ------
 llvm/test/CodeGen/R600/large-alloca.ll | 15 -
 .../CodeGen/R600/large-constant-initializer.ll | 19 -
 llvm/test/CodeGen/R600/lds-initializer.ll | 13 -
 llvm/test/CodeGen/R600/lds-oqap-crash.ll | 28 -
 llvm/test/CodeGen/R600/lds-output-queue.ll | 99 --
 llvm/test/CodeGen/R600/lds-size.ll | 26 -
 llvm/test/CodeGen/R600/lds-zero-initializer.ll | 13 -
 .../CodeGen/R600/legalizedag-bug-expand-setcc.ll | 26 -
 llvm/test/CodeGen/R600/lit.local.cfg | 2 -
 llvm/test/CodeGen/R600/literals.ll | 64 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.abs.ll | 49 -
 .../CodeGen/R600/llvm.AMDGPU.barrier.global.ll | 30 -
 .../test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll | 31 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll | 437 ------
 llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll | 627 --------
 llvm/test/CodeGen/R600/llvm.AMDGPU.bfi.ll | 42 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.bfm.ll | 60 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.brev.ll | 28 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.clamp.ll | 67 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll | 497 -------
 llvm/test/CodeGen/R600/llvm.AMDGPU.cube.ll | 59 -
 .../test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll | 43 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll | 31 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll | 179 ---
 llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll | 364 -----
 llvm/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll | 28 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll | 60 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.fract.ll | 65 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.imad24.ll | 22 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.imax.ll | 33 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.imin.ll | 33 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.imul24.ll | 16 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.kill.ll | 39 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll | 23 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll | 13 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.mul.ll | 17 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll | 33 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll | 50 -
 .../CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll | 23 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll | 23 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll | 33 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.tex.ll | 42 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll | 30 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.trunc.ll | 17 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.umad24.ll | 38 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.umax.ll | 48 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.umin.ll | 48 -
 llvm/test/CodeGen/R600/llvm.AMDGPU.umul24.ll | 18 -
 llvm/test/CodeGen/R600/llvm.SI.fs.interp.ll | 59 -
 llvm/test/CodeGen/R600/llvm.SI.gather4.ll | 509 -------
 llvm/test/CodeGen/R600/llvm.SI.getlod.ll | 45 -
 llvm/test/CodeGen/R600/llvm.SI.image.ll | 50 -
 llvm/test/CodeGen/R600/llvm.SI.image.sample.ll | 310 ----
 llvm/test/CodeGen/R600/llvm.SI.image.sample.o.ll | 310 ----
 llvm/test/CodeGen/R600/llvm.SI.imageload.ll | 132 --
 llvm/test/CodeGen/R600/llvm.SI.load.dword.ll | 53 -
 llvm/test/CodeGen/R600/llvm.SI.resinfo.ll | 111 --
 llvm/test/CodeGen/R600/llvm.SI.sample-masked.ll | 96 --
 llvm/test/CodeGen/R600/llvm.SI.sample.ll | 160 --
 llvm/test/CodeGen/R600/llvm.SI.sampled.ll | 143 --
 llvm/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll | 20 -
 llvm/test/CodeGen/R600/llvm.SI.sendmsg.ll | 24 -
 llvm/test/CodeGen/R600/llvm.SI.tbuffer.store.ll | 47 -
 llvm/test/CodeGen/R600/llvm.SI.tid.ll | 18 -
 llvm/test/CodeGen/R600/llvm.amdgpu.dp4.ll | 11 -
 llvm/test/CodeGen/R600/llvm.amdgpu.kilp.ll | 21 -
 llvm/test/CodeGen/R600/llvm.amdgpu.lrp.ll | 13 -
 llvm/test/CodeGen/R600/llvm.cos.ll | 41 -
 llvm/test/CodeGen/R600/llvm.exp2.ll | 80 -
 llvm/test/CodeGen/R600/llvm.log2.ll | 80 -
 llvm/test/CodeGen/R600/llvm.memcpy.ll | 365 -----
 llvm/test/CodeGen/R600/llvm.pow.ll | 40 -
 llvm/test/CodeGen/R600/llvm.rint.f64.ll | 46 -
 llvm/test/CodeGen/R600/llvm.rint.ll | 62 -
 llvm/test/CodeGen/R600/llvm.round.f64.ll | 74 -
 llvm/test/CodeGen/R600/llvm.round.ll | 67 -
 llvm/test/CodeGen/R600/llvm.sin.ll | 92 --
 llvm/test/CodeGen/R600/llvm.sqrt.ll | 105 --
 llvm/test/CodeGen/R600/load-i1.ll | 149 --
 llvm/test/CodeGen/R600/load-input-fold.ll | 117 --
 llvm/test/CodeGen/R600/load.ll | 709 ---------
 llvm/test/CodeGen/R600/load.vec.ll | 25 -
 llvm/test/CodeGen/R600/load64.ll | 31 -
 llvm/test/CodeGen/R600/local-64.ll | 167 ---
 llvm/test/CodeGen/R600/local-atomics.ll | 551 -------
 llvm/test/CodeGen/R600/local-atomics64.ll | 470 ------
 llvm/test/CodeGen/R600/local-memory-two-objects.ll | 63 -
 llvm/test/CodeGen/R600/local-memory.ll | 49 -
 llvm/test/CodeGen/R600/loop-address.ll | 34 -
 llvm/test/CodeGen/R600/loop-idiom.ll | 51 -
 llvm/test/CodeGen/R600/lshl.ll | 15 -
 llvm/test/CodeGen/R600/lshr.ll | 15 -
 llvm/test/CodeGen/R600/m0-spill.ll | 35 -
 llvm/test/CodeGen/R600/mad-combine.ll | 567 -------
 llvm/test/CodeGen/R600/mad-sub.ll | 215 ---
 llvm/test/CodeGen/R600/mad_int24.ll | 35 -
 llvm/test/CodeGen/R600/mad_uint24.ll | 76 -
 llvm/test/CodeGen/R600/madak.ll | 193 ---
 llvm/test/CodeGen/R600/madmk.ll | 205 ---
 llvm/test/CodeGen/R600/max-literals.ll | 67 -
 llvm/test/CodeGen/R600/max.ll | 168 ---
 llvm/test/CodeGen/R600/max3.ll | 41 -
 llvm/test/CodeGen/R600/merge-stores.ll | 536 -------
 llvm/test/CodeGen/R600/min.ll | 189 ---
 llvm/test/CodeGen/R600/min3.ll | 111 --
 llvm/test/CodeGen/R600/missing-store.ll | 26 -
 llvm/test/CodeGen/R600/mubuf.ll | 183 ---
 llvm/test/CodeGen/R600/mul.ll | 200 ---
 llvm/test/CodeGen/R600/mul_int24.ll | 23 -
 llvm/test/CodeGen/R600/mul_uint24.ll | 67 -
 llvm/test/CodeGen/R600/mulhu.ll | 17 -
 .../R600/no-initializer-constant-addrspace.ll | 21 -
 llvm/test/CodeGen/R600/no-shrink-extloads.ll | 191 ---
 llvm/test/CodeGen/R600/operand-folding.ll | 113 --
 llvm/test/CodeGen/R600/operand-spacing.ll | 18 -
 llvm/test/CodeGen/R600/or.ll | 178 ---
 llvm/test/CodeGen/R600/packetizer.ll | 34 -
 llvm/test/CodeGen/R600/parallelandifcollapse.ll | 59 -
 llvm/test/CodeGen/R600/parallelorifcollapse.ll | 66 -
 llvm/test/CodeGen/R600/predicate-dp4.ll | 27 -
 llvm/test/CodeGen/R600/predicates.ll | 104 --
 llvm/test/CodeGen/R600/private-memory-atomics.ll | 32 -
 llvm/test/CodeGen/R600/private-memory-broken.ll | 21 -
 llvm/test/CodeGen/R600/private-memory.ll | 313 ----
 llvm/test/CodeGen/R600/pv-packing.ll | 45 -
 llvm/test/CodeGen/R600/pv.ll | 241 ---
 llvm/test/CodeGen/R600/r600-encoding.ll | 25 -
 llvm/test/CodeGen/R600/r600-export-fix.ll | 142 --
 ...-infinite-loop-bug-while-reorganizing-vector.ll | 58 -
 llvm/test/CodeGen/R600/r600cfg.ll | 119 --
 llvm/test/CodeGen/R600/reciprocal.ll | 15 -
 llvm/test/CodeGen/R600/register-count-comments.ll | 27 -
 llvm/test/CodeGen/R600/reorder-stores.ll | 105 --
 llvm/test/CodeGen/R600/rotl.i64.ll | 39 -
 llvm/test/CodeGen/R600/rotl.ll | 57 -
 llvm/test/CodeGen/R600/rotr.i64.ll | 61 -
 llvm/test/CodeGen/R600/rotr.ll | 53 -
 llvm/test/CodeGen/R600/rsq.ll | 74 -
 llvm/test/CodeGen/R600/rv7x0_count3.ll | 41 -
 llvm/test/CodeGen/R600/s_movk_i32.ll | 185 ---
 llvm/test/CodeGen/R600/saddo.ll | 63 -
 llvm/test/CodeGen/R600/salu-to-valu.ll | 118 --
 llvm/test/CodeGen/R600/scalar_to_vector.ll | 81 -
 .../CodeGen/R600/schedule-fs-loop-nested-if.ll | 82 -
 llvm/test/CodeGen/R600/schedule-fs-loop-nested.ll | 88 --
 llvm/test/CodeGen/R600/schedule-fs-loop.ll | 55 -
 llvm/test/CodeGen/R600/schedule-global-loads.ll | 41 -
 llvm/test/CodeGen/R600/schedule-if-2.ll | 94 --
 llvm/test/CodeGen/R600/schedule-if.ll | 46 -
 .../test/CodeGen/R600/schedule-kernel-arg-loads.ll | 51 -
 .../R600/schedule-vs-if-nested-loop-failure.ll | 163 --
 .../CodeGen/R600/schedule-vs-if-nested-loop.ll | 132 --
 llvm/test/CodeGen/R600/scratch-buffer.ll | 87 --
 llvm/test/CodeGen/R600/sdiv.ll | 104 --
 llvm/test/CodeGen/R600/sdivrem24.ll | 239 ---
 llvm/test/CodeGen/R600/sdivrem64.ll | 225 ---
 llvm/test/CodeGen/R600/select-i1.ll | 15 -
 llvm/test/CodeGen/R600/select-vectors.ll | 156 --
 llvm/test/CodeGen/R600/select.ll | 47 -
 llvm/test/CodeGen/R600/select64.ll | 68 -
 llvm/test/CodeGen/R600/selectcc-cnd.ll | 12 -
 llvm/test/CodeGen/R600/selectcc-cnde-int.ll | 12 -
 .../CodeGen/R600/selectcc-icmp-select-float.ll | 16 -
 llvm/test/CodeGen/R600/selectcc-opt.ll | 80 -
 llvm/test/CodeGen/R600/selectcc.ll | 20 -
 llvm/test/CodeGen/R600/set-dx10.ll | 161 --
 llvm/test/CodeGen/R600/setcc-equivalent.ll | 30 -
 llvm/test/CodeGen/R600/setcc-opt.ll | 236 ---
 llvm/test/CodeGen/R600/setcc.ll | 377 -----
 llvm/test/CodeGen/R600/setcc64.ll | 259 ----
 llvm/test/CodeGen/R600/seto.ll | 15 -
 llvm/test/CodeGen/R600/setuo.ll | 15 -
 llvm/test/CodeGen/R600/sext-eliminate.ll | 26 -
 llvm/test/CodeGen/R600/sext-in-reg.ll | 611 --------
 llvm/test/CodeGen/R600/sgpr-control-flow.ll | 105 --
 .../CodeGen/R600/sgpr-copy-duplicate-operand.ll | 19 -
 llvm/test/CodeGen/R600/sgpr-copy.ll | 379 -----
 llvm/test/CodeGen/R600/shared-op-cycle.ll | 32 -
 llvm/test/CodeGen/R600/shl.ll | 180 ---
 llvm/test/CodeGen/R600/shl_add_constant.ll | 90 --
 llvm/test/CodeGen/R600/shl_add_ptr.ll | 284 ----
 llvm/test/CodeGen/R600/si-annotate-cf-assertion.ll | 25 -
 llvm/test/CodeGen/R600/si-annotate-cf.ll | 63 -
 llvm/test/CodeGen/R600/si-lod-bias.ll | 52 -
 llvm/test/CodeGen/R600/si-sgpr-spill.ll | 1568 --------------------
 llvm/test/CodeGen/R600/si-spill-cf.ll | 501 -------
 .../CodeGen/R600/si-triv-disjoint-mem-access.ll | 236 ---
 llvm/test/CodeGen/R600/si-vector-hang.ll | 105 --
 llvm/test/CodeGen/R600/sign_extend.ll | 63 -
 .../R600/simplify-demanded-bits-build-pair.ll | 39 -
 llvm/test/CodeGen/R600/sint_to_fp.f64.ll | 61 -
 llvm/test/CodeGen/R600/sint_to_fp.ll | 64 -
 llvm/test/CodeGen/R600/smrd.ll | 111 --
 llvm/test/CodeGen/R600/split-scalar-i64-add.ll | 48 -
 llvm/test/CodeGen/R600/sra.ll | 213 ---
 llvm/test/CodeGen/R600/srem.ll | 112 --
 llvm/test/CodeGen/R600/srl.ll | 186 ---
 llvm/test/CodeGen/R600/ssubo.ll | 65 -
 llvm/test/CodeGen/R600/store-barrier.ll | 42 -
 llvm/test/CodeGen/R600/store-v3i32.ll | 13 -
 llvm/test/CodeGen/R600/store-v3i64.ll | 29 -
 llvm/test/CodeGen/R600/store-vector-ptrs.ll | 12 -
 llvm/test/CodeGen/R600/store.ll | 369 -----
 llvm/test/CodeGen/R600/store.r600.ll | 22 -
 llvm/test/CodeGen/R600/structurize.ll | 83 --
 llvm/test/CodeGen/R600/structurize1.ll | 62 -
 llvm/test/CodeGen/R600/sub.ll | 130 --
 llvm/test/CodeGen/R600/subreg-coalescer-crash.ll | 109 --
 llvm/test/CodeGen/R600/subreg-eliminate-dead.ll | 19 -
 llvm/test/CodeGen/R600/swizzle-export.ll | 129 --
 llvm/test/CodeGen/R600/tex-clause-antidep.ll | 25 -
 llvm/test/CodeGen/R600/texture-input-merge.ll | 31 -
 llvm/test/CodeGen/R600/trunc-cmp-constant.ll | 170 ---
 llvm/test/CodeGen/R600/trunc-store-f64-to-f16.ll | 56 -
 llvm/test/CodeGen/R600/trunc-store-i1.ll | 33 -
 .../R600/trunc-vector-store-assertion-failure.ll | 20 -
 llvm/test/CodeGen/R600/trunc.ll | 100 --
 llvm/test/CodeGen/R600/tti-unroll-prefs.ll | 58 -
 llvm/test/CodeGen/R600/uaddo.ll | 85 --
 llvm/test/CodeGen/R600/udiv.ll | 48 -
 llvm/test/CodeGen/R600/udivrem.ll | 345 -----
 llvm/test/CodeGen/R600/udivrem24.ll | 245 ---
 llvm/test/CodeGen/R600/udivrem64.ll | 223 ---
 llvm/test/CodeGen/R600/uint_to_fp.f64.ll | 98 --
 llvm/test/CodeGen/R600/uint_to_fp.ll | 82 -
 llvm/test/CodeGen/R600/unaligned-load-store.ll | 254 ----
 .../R600/unhandled-loop-condition-assertion.ll | 115 --
 llvm/test/CodeGen/R600/unroll.ll | 36 -
 llvm/test/CodeGen/R600/unsupported-cc.ll | 125 --
 llvm/test/CodeGen/R600/urecip.ll | 13 -
 llvm/test/CodeGen/R600/urem.ll | 94 --
 llvm/test/CodeGen/R600/use-sgpr-multiple-times.ll | 103 --
 llvm/test/CodeGen/R600/usubo.ll | 86 --
 llvm/test/CodeGen/R600/v1i64-kernel-arg.ll | 17 -
 llvm/test/CodeGen/R600/v_cndmask.ll | 39 -
 llvm/test/CodeGen/R600/valu-i1.ll | 188 ---
 llvm/test/CodeGen/R600/vector-alloca.ll | 77 -
 llvm/test/CodeGen/R600/vertex-fetch-encoding.ll | 25 -
 llvm/test/CodeGen/R600/vop-shrink.ll | 51 -
 llvm/test/CodeGen/R600/vselect.ll | 77 -
 llvm/test/CodeGen/R600/vselect64.ll | 15 -
 llvm/test/CodeGen/R600/vtx-fetch-branch.ll | 29 -
 llvm/test/CodeGen/R600/vtx-schedule.ll | 18 -
 llvm/test/CodeGen/R600/wait.ll | 45 -
 llvm/test/CodeGen/R600/work-item-intrinsics.ll | 238 ---
 llvm/test/CodeGen/R600/wrong-transalu-pos-fix.ll | 81 -
 llvm/test/CodeGen/R600/xor.ll | 173 ---
 llvm/test/CodeGen/R600/zero_extend.ll | 41 -
 800 files changed, 45283 insertions(+), 45283 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/README
 create mode 100644 llvm/test/CodeGen/AMDGPU/add-debug.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/add.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/add_i64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/address-space.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/and.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/anyext.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_load_add.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/basic-branch.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/basic-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/bfe_uint.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/bfi_int.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/big_alu.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/bitcast.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/bswap.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/build_vector.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/call.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/call_fs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cf_end.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/combine_vloads.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/commute-compares.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/complex-folding.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/concat_vectors.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/copy-to-reg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ctpop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ctpop64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/debug.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/default-fp-mode.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/dot4-folding.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds_read2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds_write2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/elf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/elf.r600.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/empty-function.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/extload-private.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/extload.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fabs.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fabs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fadd.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fadd64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fceil.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fceil64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcmp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcmp64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fconst64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fdiv.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fetch-limits.r600.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fetch-limits.r700+.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ffloor.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ffloor.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/flat-address-space.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/floor.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fma-combine.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fma.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fma.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmad.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmax.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmax3.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmaxnum.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmin3.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fminnum.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmul.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmul64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fmuladd.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fnearbyint.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fneg.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fneg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp-classify.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp16_to_fp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fpext.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fptrunc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/frem.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fsqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fsub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/fsub64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/ftrunc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/gep-address-space.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-directive.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-extload-i1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-extload-i16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-extload-i32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-extload-i8.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global-zero-initializer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/global_atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/half.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/hsa.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/icmp64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/imm.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/infinite-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/inline-asm.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/inline-calls.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/input-mods.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_subreg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/jump-address.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/kcache-fold.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/kernel-args.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/large-alloca.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-initializer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-output-queue.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-size.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lit.local.cfg
 create mode 100644 llvm/test/CodeGen/AMDGPU/literals.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.image.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.imageload.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.sample.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.sampled.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.SI.tid.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.cos.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.log2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.pow.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.rint.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.round.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.sin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.sqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/load-i1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/load-input-fold.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/load.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/load.vec.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/load64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/local-64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/local-atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/local-atomics64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/local-memory.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/loop-address.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/loop-idiom.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lshl.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lshr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/m0-spill.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad-combine.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad-sub.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad_int24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mad_uint24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/madak.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/madmk.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/max-literals.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/max.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/max3.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/merge-stores.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/min.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/min3.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/missing-store.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mubuf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mul.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mul_int24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mul_uint24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/mulhu.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/operand-folding.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/operand-spacing.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/or.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/packetizer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/predicate-dp4.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/predicates.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/private-memory-broken.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/private-memory.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/pv-packing.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/pv.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/r600-encoding.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/r600-export-fix.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/r600cfg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/reciprocal.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/register-count-comments.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/reorder-stores.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rotl.i64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rotl.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rotr.i64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rotr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rsq.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/rv7x0_count3.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/saddo.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-if-2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-if.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdiv.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdivrem24.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdivrem64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/select-i1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/select-vectors.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/select.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/select64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/selectcc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/set-dx10.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/setcc-opt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/setcc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/setcc64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/seto.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/setuo.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sext-eliminate.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/shared-op-cycle.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/shl.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/shl_add_constant.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-lod-bias.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-spill-cf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sign_extend.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/smrd.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/sra.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/srem.ll create mode 100644 llvm/test/CodeGen/AMDGPU/srl.ll create mode 100644 llvm/test/CodeGen/AMDGPU/ssubo.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-barrier.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-v3i32.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-v3i64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store.r600.ll create mode 100644 llvm/test/CodeGen/AMDGPU/structurize.ll create mode 100644 llvm/test/CodeGen/AMDGPU/structurize1.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll create mode 100644 llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll create mode 100644 llvm/test/CodeGen/AMDGPU/swizzle-export.ll create mode 100644 llvm/test/CodeGen/AMDGPU/tex-clause-antidep.ll create mode 100644 llvm/test/CodeGen/AMDGPU/texture-input-merge.ll create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll create mode 100644 llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll create mode 100644 llvm/test/CodeGen/AMDGPU/trunc.ll create mode 100644 llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uaddo.ll create mode 100644 llvm/test/CodeGen/AMDGPU/udiv.ll create mode 100644 llvm/test/CodeGen/AMDGPU/udivrem.ll create mode 100644 llvm/test/CodeGen/AMDGPU/udivrem24.ll create mode 100644 llvm/test/CodeGen/AMDGPU/udivrem64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/uint_to_fp.ll create mode 100644 llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll create mode 100644 llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll create mode 100644 llvm/test/CodeGen/AMDGPU/unroll.ll create mode 100644 llvm/test/CodeGen/AMDGPU/unsupported-cc.ll create mode 100644 llvm/test/CodeGen/AMDGPU/urecip.ll create mode 100644 llvm/test/CodeGen/AMDGPU/urem.ll create mode 100644 llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll create mode 100644 llvm/test/CodeGen/AMDGPU/usubo.ll create mode 100644 llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll create mode 100644 llvm/test/CodeGen/AMDGPU/v_cndmask.ll create mode 100644 llvm/test/CodeGen/AMDGPU/valu-i1.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vector-alloca.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vop-shrink.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vselect.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vselect64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll create mode 100644 llvm/test/CodeGen/AMDGPU/vtx-schedule.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wait.ll create mode 100644 llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll create mode 100644 llvm/test/CodeGen/AMDGPU/xor.ll create mode 100644 llvm/test/CodeGen/AMDGPU/zero_extend.ll delete mode 100644 llvm/test/CodeGen/R600/32-bit-local-address-space.ll delete mode 100644 llvm/test/CodeGen/R600/README delete mode 100644 llvm/test/CodeGen/R600/add-debug.ll delete mode 100644 llvm/test/CodeGen/R600/add.ll delete mode 100644 llvm/test/CodeGen/R600/add_i64.ll delete mode 100644 llvm/test/CodeGen/R600/address-space.ll delete mode 
100644 llvm/test/CodeGen/R600/and.ll delete mode 100644 llvm/test/CodeGen/R600/anyext.ll delete mode 100644 llvm/test/CodeGen/R600/array-ptr-calc-i32.ll delete mode 100644 llvm/test/CodeGen/R600/array-ptr-calc-i64.ll delete mode 100644 llvm/test/CodeGen/R600/atomic_cmp_swap_local.ll delete mode 100644 llvm/test/CodeGen/R600/atomic_load_add.ll delete mode 100644 llvm/test/CodeGen/R600/atomic_load_sub.ll delete mode 100644 llvm/test/CodeGen/R600/basic-branch.ll delete mode 100644 llvm/test/CodeGen/R600/basic-loop.ll delete mode 100644 llvm/test/CodeGen/R600/bfe_uint.ll delete mode 100644 llvm/test/CodeGen/R600/bfi_int.ll delete mode 100644 llvm/test/CodeGen/R600/big_alu.ll delete mode 100644 llvm/test/CodeGen/R600/bitcast.ll delete mode 100644 llvm/test/CodeGen/R600/bswap.ll delete mode 100644 llvm/test/CodeGen/R600/build_vector.ll delete mode 100644 llvm/test/CodeGen/R600/call.ll delete mode 100644 llvm/test/CodeGen/R600/call_fs.ll delete mode 100644 llvm/test/CodeGen/R600/cayman-loop-bug.ll delete mode 100644 llvm/test/CodeGen/R600/cf-stack-bug.ll delete mode 100644 llvm/test/CodeGen/R600/cf_end.ll delete mode 100644 llvm/test/CodeGen/R600/cgp-addressing-modes.ll delete mode 100644 llvm/test/CodeGen/R600/coalescer_remat.ll delete mode 100644 llvm/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll delete mode 100644 llvm/test/CodeGen/R600/combine_vloads.ll delete mode 100644 llvm/test/CodeGen/R600/commute-compares.ll delete mode 100644 llvm/test/CodeGen/R600/commute_modifiers.ll delete mode 100644 llvm/test/CodeGen/R600/complex-folding.ll delete mode 100644 llvm/test/CodeGen/R600/concat_vectors.ll delete mode 100644 llvm/test/CodeGen/R600/copy-illegal-type.ll delete mode 100644 llvm/test/CodeGen/R600/copy-to-reg.ll delete mode 100644 llvm/test/CodeGen/R600/ctlz_zero_undef.ll delete mode 100644 llvm/test/CodeGen/R600/ctpop.ll delete mode 100644 llvm/test/CodeGen/R600/ctpop64.ll delete mode 100644 llvm/test/CodeGen/R600/cttz_zero_undef.ll delete mode 100644 llvm/test/CodeGen/R600/cvt_f32_ubyte.ll delete mode 100644 llvm/test/CodeGen/R600/cvt_flr_i32_f32.ll delete mode 100644 llvm/test/CodeGen/R600/cvt_rpi_i32_f32.ll delete mode 100644 llvm/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll delete mode 100644 llvm/test/CodeGen/R600/debug.ll delete mode 100644 llvm/test/CodeGen/R600/default-fp-mode.ll delete mode 100644 llvm/test/CodeGen/R600/disconnected-predset-break-bug.ll delete mode 100644 llvm/test/CodeGen/R600/dot4-folding.ll delete mode 100644 llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll delete mode 100644 llvm/test/CodeGen/R600/ds_read2.ll delete mode 100644 llvm/test/CodeGen/R600/ds_read2_offset_order.ll delete mode 100644 llvm/test/CodeGen/R600/ds_read2st64.ll delete mode 100644 llvm/test/CodeGen/R600/ds_write2.ll delete mode 100644 llvm/test/CodeGen/R600/ds_write2st64.ll delete mode 100644 llvm/test/CodeGen/R600/elf.ll delete mode 100644 llvm/test/CodeGen/R600/elf.r600.ll delete mode 100644 llvm/test/CodeGen/R600/empty-function.ll delete mode 100644 llvm/test/CodeGen/R600/endcf-loop-header.ll delete mode 100644 llvm/test/CodeGen/R600/extload-private.ll delete mode 100644 llvm/test/CodeGen/R600/extload.ll delete mode 100644 llvm/test/CodeGen/R600/extract_vector_elt_i16.ll delete mode 100644 llvm/test/CodeGen/R600/fabs.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fabs.ll delete mode 100644 llvm/test/CodeGen/R600/fadd.ll delete mode 100644 llvm/test/CodeGen/R600/fadd64.ll delete mode 100644 llvm/test/CodeGen/R600/fceil.ll delete mode 100644 
llvm/test/CodeGen/R600/fceil64.ll delete mode 100644 llvm/test/CodeGen/R600/fcmp-cnd.ll delete mode 100644 llvm/test/CodeGen/R600/fcmp-cnde-int-args.ll delete mode 100644 llvm/test/CodeGen/R600/fcmp.ll delete mode 100644 llvm/test/CodeGen/R600/fcmp64.ll delete mode 100644 llvm/test/CodeGen/R600/fconst64.ll delete mode 100644 llvm/test/CodeGen/R600/fcopysign.f32.ll delete mode 100644 llvm/test/CodeGen/R600/fcopysign.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fdiv.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fdiv.ll delete mode 100644 llvm/test/CodeGen/R600/fetch-limits.r600.ll delete mode 100644 llvm/test/CodeGen/R600/fetch-limits.r700+.ll delete mode 100644 llvm/test/CodeGen/R600/ffloor.f64.ll delete mode 100644 llvm/test/CodeGen/R600/ffloor.ll delete mode 100644 llvm/test/CodeGen/R600/flat-address-space.ll delete mode 100644 llvm/test/CodeGen/R600/floor.ll delete mode 100644 llvm/test/CodeGen/R600/fma-combine.ll delete mode 100644 llvm/test/CodeGen/R600/fma.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fma.ll delete mode 100644 llvm/test/CodeGen/R600/fmad.ll delete mode 100644 llvm/test/CodeGen/R600/fmax.ll delete mode 100644 llvm/test/CodeGen/R600/fmax3.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fmax3.ll delete mode 100644 llvm/test/CodeGen/R600/fmax_legacy.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fmax_legacy.ll delete mode 100644 llvm/test/CodeGen/R600/fmaxnum.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fmaxnum.ll delete mode 100644 llvm/test/CodeGen/R600/fmin.ll delete mode 100644 llvm/test/CodeGen/R600/fmin3.ll delete mode 100644 llvm/test/CodeGen/R600/fmin_legacy.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fmin_legacy.ll delete mode 100644 llvm/test/CodeGen/R600/fminnum.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fminnum.ll delete mode 100644 llvm/test/CodeGen/R600/fmul.ll delete mode 100644 llvm/test/CodeGen/R600/fmul64.ll delete mode 100644 llvm/test/CodeGen/R600/fmuladd.ll delete mode 100644 llvm/test/CodeGen/R600/fnearbyint.ll delete mode 100644 llvm/test/CodeGen/R600/fneg-fabs.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fneg-fabs.ll delete mode 100644 llvm/test/CodeGen/R600/fneg.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fneg.ll delete mode 100644 llvm/test/CodeGen/R600/fp-classify.ll delete mode 100644 llvm/test/CodeGen/R600/fp16_to_fp.ll delete mode 100644 llvm/test/CodeGen/R600/fp32_to_fp16.ll delete mode 100644 llvm/test/CodeGen/R600/fp_to_sint.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fp_to_sint.ll delete mode 100644 llvm/test/CodeGen/R600/fp_to_uint.f64.ll delete mode 100644 llvm/test/CodeGen/R600/fp_to_uint.ll delete mode 100644 llvm/test/CodeGen/R600/fpext.ll delete mode 100644 llvm/test/CodeGen/R600/fptrunc.ll delete mode 100644 llvm/test/CodeGen/R600/frem.ll delete mode 100644 llvm/test/CodeGen/R600/fsqrt.ll delete mode 100644 llvm/test/CodeGen/R600/fsub.ll delete mode 100644 llvm/test/CodeGen/R600/fsub64.ll delete mode 100644 llvm/test/CodeGen/R600/ftrunc.f64.ll delete mode 100644 llvm/test/CodeGen/R600/ftrunc.ll delete mode 100644 llvm/test/CodeGen/R600/gep-address-space.ll delete mode 100644 llvm/test/CodeGen/R600/global-directive.ll delete mode 100644 llvm/test/CodeGen/R600/global-extload-i1.ll delete mode 100644 llvm/test/CodeGen/R600/global-extload-i16.ll delete mode 100644 llvm/test/CodeGen/R600/global-extload-i32.ll delete mode 100644 llvm/test/CodeGen/R600/global-extload-i8.ll delete mode 100644 llvm/test/CodeGen/R600/global-zero-initializer.ll delete mode 100644 
llvm/test/CodeGen/R600/global_atomics.ll delete mode 100644 llvm/test/CodeGen/R600/gv-const-addrspace-fail.ll delete mode 100644 llvm/test/CodeGen/R600/gv-const-addrspace.ll delete mode 100644 llvm/test/CodeGen/R600/half.ll delete mode 100644 llvm/test/CodeGen/R600/hsa.ll delete mode 100644 llvm/test/CodeGen/R600/i1-copy-implicit-def.ll delete mode 100644 llvm/test/CodeGen/R600/i1-copy-phi.ll delete mode 100644 llvm/test/CodeGen/R600/i8-to-double-to-float.ll delete mode 100644 llvm/test/CodeGen/R600/icmp-select-sete-reverse-args.ll delete mode 100644 llvm/test/CodeGen/R600/icmp64.ll delete mode 100644 llvm/test/CodeGen/R600/imm.ll delete mode 100644 llvm/test/CodeGen/R600/indirect-addressing-si.ll delete mode 100644 llvm/test/CodeGen/R600/indirect-private-64.ll delete mode 100644 llvm/test/CodeGen/R600/infinite-loop-evergreen.ll delete mode 100644 llvm/test/CodeGen/R600/infinite-loop.ll delete mode 100644 llvm/test/CodeGen/R600/inline-asm.ll delete mode 100644 llvm/test/CodeGen/R600/inline-calls.ll delete mode 100644 llvm/test/CodeGen/R600/input-mods.ll delete mode 100644 llvm/test/CodeGen/R600/insert_subreg.ll delete mode 100644 llvm/test/CodeGen/R600/insert_vector_elt.ll delete mode 100644 llvm/test/CodeGen/R600/jump-address.ll delete mode 100644 llvm/test/CodeGen/R600/kcache-fold.ll delete mode 100644 llvm/test/CodeGen/R600/kernel-args.ll delete mode 100644 llvm/test/CodeGen/R600/large-alloca.ll delete mode 100644 llvm/test/CodeGen/R600/large-constant-initializer.ll delete mode 100644 llvm/test/CodeGen/R600/lds-initializer.ll delete mode 100644 llvm/test/CodeGen/R600/lds-oqap-crash.ll delete mode 100644 llvm/test/CodeGen/R600/lds-output-queue.ll delete mode 100644 llvm/test/CodeGen/R600/lds-size.ll delete mode 100644 llvm/test/CodeGen/R600/lds-zero-initializer.ll delete mode 100644 llvm/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll delete mode 100644 llvm/test/CodeGen/R600/lit.local.cfg delete mode 100644 llvm/test/CodeGen/R600/literals.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.abs.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.bfi.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.bfm.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.brev.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.clamp.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.cube.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.fract.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.imad24.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.imax.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.imin.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.imul24.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.kill.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll delete mode 100644 
llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.mul.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.tex.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.trunc.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.umad24.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.umax.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.umin.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.AMDGPU.umul24.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.fs.interp.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.gather4.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.getlod.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.image.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.image.sample.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.image.sample.o.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.imageload.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.load.dword.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.resinfo.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.sample-masked.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.sample.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.sampled.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.sendmsg.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.tbuffer.store.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.SI.tid.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.amdgpu.dp4.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.amdgpu.kilp.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.amdgpu.lrp.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.cos.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.exp2.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.log2.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.memcpy.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.pow.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.rint.f64.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.rint.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.round.f64.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.round.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.sin.ll delete mode 100644 llvm/test/CodeGen/R600/llvm.sqrt.ll delete mode 100644 llvm/test/CodeGen/R600/load-i1.ll delete mode 100644 llvm/test/CodeGen/R600/load-input-fold.ll delete mode 100644 llvm/test/CodeGen/R600/load.ll delete mode 100644 llvm/test/CodeGen/R600/load.vec.ll delete mode 100644 llvm/test/CodeGen/R600/load64.ll delete mode 100644 llvm/test/CodeGen/R600/local-64.ll delete mode 100644 llvm/test/CodeGen/R600/local-atomics.ll delete mode 100644 llvm/test/CodeGen/R600/local-atomics64.ll delete mode 100644 llvm/test/CodeGen/R600/local-memory-two-objects.ll delete mode 100644 llvm/test/CodeGen/R600/local-memory.ll delete mode 100644 llvm/test/CodeGen/R600/loop-address.ll delete mode 100644 llvm/test/CodeGen/R600/loop-idiom.ll delete mode 100644 llvm/test/CodeGen/R600/lshl.ll delete mode 100644 llvm/test/CodeGen/R600/lshr.ll delete mode 100644 llvm/test/CodeGen/R600/m0-spill.ll delete mode 100644 
llvm/test/CodeGen/R600/mad-combine.ll delete mode 100644 llvm/test/CodeGen/R600/mad-sub.ll delete mode 100644 llvm/test/CodeGen/R600/mad_int24.ll delete mode 100644 llvm/test/CodeGen/R600/mad_uint24.ll delete mode 100644 llvm/test/CodeGen/R600/madak.ll delete mode 100644 llvm/test/CodeGen/R600/madmk.ll delete mode 100644 llvm/test/CodeGen/R600/max-literals.ll delete mode 100644 llvm/test/CodeGen/R600/max.ll delete mode 100644 llvm/test/CodeGen/R600/max3.ll delete mode 100644 llvm/test/CodeGen/R600/merge-stores.ll delete mode 100644 llvm/test/CodeGen/R600/min.ll delete mode 100644 llvm/test/CodeGen/R600/min3.ll delete mode 100644 llvm/test/CodeGen/R600/missing-store.ll delete mode 100644 llvm/test/CodeGen/R600/mubuf.ll delete mode 100644 llvm/test/CodeGen/R600/mul.ll delete mode 100644 llvm/test/CodeGen/R600/mul_int24.ll delete mode 100644 llvm/test/CodeGen/R600/mul_uint24.ll delete mode 100644 llvm/test/CodeGen/R600/mulhu.ll delete mode 100644 llvm/test/CodeGen/R600/no-initializer-constant-addrspace.ll delete mode 100644 llvm/test/CodeGen/R600/no-shrink-extloads.ll delete mode 100644 llvm/test/CodeGen/R600/operand-folding.ll delete mode 100644 llvm/test/CodeGen/R600/operand-spacing.ll delete mode 100644 llvm/test/CodeGen/R600/or.ll delete mode 100644 llvm/test/CodeGen/R600/packetizer.ll delete mode 100644 llvm/test/CodeGen/R600/parallelandifcollapse.ll delete mode 100644 llvm/test/CodeGen/R600/parallelorifcollapse.ll delete mode 100644 llvm/test/CodeGen/R600/predicate-dp4.ll delete mode 100644 llvm/test/CodeGen/R600/predicates.ll delete mode 100644 llvm/test/CodeGen/R600/private-memory-atomics.ll delete mode 100644 llvm/test/CodeGen/R600/private-memory-broken.ll delete mode 100644 llvm/test/CodeGen/R600/private-memory.ll delete mode 100644 llvm/test/CodeGen/R600/pv-packing.ll delete mode 100644 llvm/test/CodeGen/R600/pv.ll delete mode 100644 llvm/test/CodeGen/R600/r600-encoding.ll delete mode 100644 llvm/test/CodeGen/R600/r600-export-fix.ll delete mode 100644 llvm/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll delete mode 100644 llvm/test/CodeGen/R600/r600cfg.ll delete mode 100644 llvm/test/CodeGen/R600/reciprocal.ll delete mode 100644 llvm/test/CodeGen/R600/register-count-comments.ll delete mode 100644 llvm/test/CodeGen/R600/reorder-stores.ll delete mode 100644 llvm/test/CodeGen/R600/rotl.i64.ll delete mode 100644 llvm/test/CodeGen/R600/rotl.ll delete mode 100644 llvm/test/CodeGen/R600/rotr.i64.ll delete mode 100644 llvm/test/CodeGen/R600/rotr.ll delete mode 100644 llvm/test/CodeGen/R600/rsq.ll delete mode 100644 llvm/test/CodeGen/R600/rv7x0_count3.ll delete mode 100644 llvm/test/CodeGen/R600/s_movk_i32.ll delete mode 100644 llvm/test/CodeGen/R600/saddo.ll delete mode 100644 llvm/test/CodeGen/R600/salu-to-valu.ll delete mode 100644 llvm/test/CodeGen/R600/scalar_to_vector.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-fs-loop-nested-if.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-fs-loop-nested.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-fs-loop.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-global-loads.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-if-2.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-if.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-kernel-arg-loads.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll delete mode 100644 llvm/test/CodeGen/R600/schedule-vs-if-nested-loop.ll delete mode 100644 llvm/test/CodeGen/R600/scratch-buffer.ll delete mode 100644 
llvm/test/CodeGen/R600/sdiv.ll delete mode 100644 llvm/test/CodeGen/R600/sdivrem24.ll delete mode 100644 llvm/test/CodeGen/R600/sdivrem64.ll delete mode 100644 llvm/test/CodeGen/R600/select-i1.ll delete mode 100644 llvm/test/CodeGen/R600/select-vectors.ll delete mode 100644 llvm/test/CodeGen/R600/select.ll delete mode 100644 llvm/test/CodeGen/R600/select64.ll delete mode 100644 llvm/test/CodeGen/R600/selectcc-cnd.ll delete mode 100644 llvm/test/CodeGen/R600/selectcc-cnde-int.ll delete mode 100644 llvm/test/CodeGen/R600/selectcc-icmp-select-float.ll delete mode 100644 llvm/test/CodeGen/R600/selectcc-opt.ll delete mode 100644 llvm/test/CodeGen/R600/selectcc.ll delete mode 100644 llvm/test/CodeGen/R600/set-dx10.ll delete mode 100644 llvm/test/CodeGen/R600/setcc-equivalent.ll delete mode 100644 llvm/test/CodeGen/R600/setcc-opt.ll delete mode 100644 llvm/test/CodeGen/R600/setcc.ll delete mode 100644 llvm/test/CodeGen/R600/setcc64.ll delete mode 100644 llvm/test/CodeGen/R600/seto.ll delete mode 100644 llvm/test/CodeGen/R600/setuo.ll delete mode 100644 llvm/test/CodeGen/R600/sext-eliminate.ll delete mode 100644 llvm/test/CodeGen/R600/sext-in-reg.ll delete mode 100644 llvm/test/CodeGen/R600/sgpr-control-flow.ll delete mode 100644 llvm/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll delete mode 100644 llvm/test/CodeGen/R600/sgpr-copy.ll delete mode 100644 llvm/test/CodeGen/R600/shared-op-cycle.ll delete mode 100644 llvm/test/CodeGen/R600/shl.ll delete mode 100644 llvm/test/CodeGen/R600/shl_add_constant.ll delete mode 100644 llvm/test/CodeGen/R600/shl_add_ptr.ll delete mode 100644 llvm/test/CodeGen/R600/si-annotate-cf-assertion.ll delete mode 100644 llvm/test/CodeGen/R600/si-annotate-cf.ll delete mode 100644 llvm/test/CodeGen/R600/si-lod-bias.ll delete mode 100644 llvm/test/CodeGen/R600/si-sgpr-spill.ll delete mode 100644 llvm/test/CodeGen/R600/si-spill-cf.ll delete mode 100644 llvm/test/CodeGen/R600/si-triv-disjoint-mem-access.ll delete mode 100644 llvm/test/CodeGen/R600/si-vector-hang.ll delete mode 100644 llvm/test/CodeGen/R600/sign_extend.ll delete mode 100644 llvm/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll delete mode 100644 llvm/test/CodeGen/R600/sint_to_fp.f64.ll delete mode 100644 llvm/test/CodeGen/R600/sint_to_fp.ll delete mode 100644 llvm/test/CodeGen/R600/smrd.ll delete mode 100644 llvm/test/CodeGen/R600/split-scalar-i64-add.ll delete mode 100644 llvm/test/CodeGen/R600/sra.ll delete mode 100644 llvm/test/CodeGen/R600/srem.ll delete mode 100644 llvm/test/CodeGen/R600/srl.ll delete mode 100644 llvm/test/CodeGen/R600/ssubo.ll delete mode 100644 llvm/test/CodeGen/R600/store-barrier.ll delete mode 100644 llvm/test/CodeGen/R600/store-v3i32.ll delete mode 100644 llvm/test/CodeGen/R600/store-v3i64.ll delete mode 100644 llvm/test/CodeGen/R600/store-vector-ptrs.ll delete mode 100644 llvm/test/CodeGen/R600/store.ll delete mode 100644 llvm/test/CodeGen/R600/store.r600.ll delete mode 100644 llvm/test/CodeGen/R600/structurize.ll delete mode 100644 llvm/test/CodeGen/R600/structurize1.ll delete mode 100644 llvm/test/CodeGen/R600/sub.ll delete mode 100644 llvm/test/CodeGen/R600/subreg-coalescer-crash.ll delete mode 100644 llvm/test/CodeGen/R600/subreg-eliminate-dead.ll delete mode 100644 llvm/test/CodeGen/R600/swizzle-export.ll delete mode 100644 llvm/test/CodeGen/R600/tex-clause-antidep.ll delete mode 100644 llvm/test/CodeGen/R600/texture-input-merge.ll delete mode 100644 llvm/test/CodeGen/R600/trunc-cmp-constant.ll delete mode 100644 llvm/test/CodeGen/R600/trunc-store-f64-to-f16.ll delete 
mode 100644 llvm/test/CodeGen/R600/trunc-store-i1.ll delete mode 100644 llvm/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll delete mode 100644 llvm/test/CodeGen/R600/trunc.ll delete mode 100644 llvm/test/CodeGen/R600/tti-unroll-prefs.ll delete mode 100644 llvm/test/CodeGen/R600/uaddo.ll delete mode 100644 llvm/test/CodeGen/R600/udiv.ll delete mode 100644 llvm/test/CodeGen/R600/udivrem.ll delete mode 100644 llvm/test/CodeGen/R600/udivrem24.ll delete mode 100644 llvm/test/CodeGen/R600/udivrem64.ll delete mode 100644 llvm/test/CodeGen/R600/uint_to_fp.f64.ll delete mode 100644 llvm/test/CodeGen/R600/uint_to_fp.ll delete mode 100644 llvm/test/CodeGen/R600/unaligned-load-store.ll delete mode 100644 llvm/test/CodeGen/R600/unhandled-loop-condition-assertion.ll delete mode 100644 llvm/test/CodeGen/R600/unroll.ll delete mode 100644 llvm/test/CodeGen/R600/unsupported-cc.ll delete mode 100644 llvm/test/CodeGen/R600/urecip.ll delete mode 100644 llvm/test/CodeGen/R600/urem.ll delete mode 100644 llvm/test/CodeGen/R600/use-sgpr-multiple-times.ll delete mode 100644 llvm/test/CodeGen/R600/usubo.ll delete mode 100644 llvm/test/CodeGen/R600/v1i64-kernel-arg.ll delete mode 100644 llvm/test/CodeGen/R600/v_cndmask.ll delete mode 100644 llvm/test/CodeGen/R600/valu-i1.ll delete mode 100644 llvm/test/CodeGen/R600/vector-alloca.ll delete mode 100644 llvm/test/CodeGen/R600/vertex-fetch-encoding.ll delete mode 100644 llvm/test/CodeGen/R600/vop-shrink.ll delete mode 100644 llvm/test/CodeGen/R600/vselect.ll delete mode 100644 llvm/test/CodeGen/R600/vselect64.ll delete mode 100644 llvm/test/CodeGen/R600/vtx-fetch-branch.ll delete mode 100644 llvm/test/CodeGen/R600/vtx-schedule.ll delete mode 100644 llvm/test/CodeGen/R600/wait.ll delete mode 100644 llvm/test/CodeGen/R600/work-item-intrinsics.ll delete mode 100644 llvm/test/CodeGen/R600/wrong-transalu-pos-fix.ll delete mode 100644 llvm/test/CodeGen/R600/xor.ll delete mode 100644 llvm/test/CodeGen/R600/zero_extend.ll (limited to 'llvm/test/CodeGen') diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll new file mode 100644 index 00000000000..c7bcfd2ddab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -0,0 +1,139 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and +; the global address space(1) uses 64-bit pointers. These tests check to make sure +; the correct pointer size is used for the local address space. + +; The e{{32|64}} suffix on the instructions refers to the encoding size and not +; the size of the operands. The operand size is denoted in the instruction name. +; Instructions with B32, U32, and I32 in their name take 32-bit operands, while +; instructions with B64, U64, and I64 take 64-bit operands. 
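+; For example, v_add_i32_e32 and v_add_i32_e64 are the VOP2 (32-bit) and +; VOP3 (64-bit) encodings of the same 32-bit integer add.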
+ +; FUNC-LABEL: {{^}}local_address_load: +; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] +; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] +define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep: +; SI: s_add_i32 [[SPTR:s[0-9]]] +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 +define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}null_32bit_lds_ptr: +; SI: v_cmp_ne_i32 +; SI-NOT: v_cmp_ne_i32 +; SI: v_cndmask_b32 +define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { + %cmp = icmp ne i32 addrspace(3)* %lds, null + %x = select i1 %cmp, i32 123, i32 456 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul_32bit_ptr: +; SI: s_mul_i32 +; SI-NEXT: s_add_i32 +; SI: ds_read_b32 +define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { + %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 + %val = load float, float addrspace(3)* %ptr + store float %val, float addrspace(1)* %out + ret void +} + +@g_lds = addrspace(3) global float undef, align 4 + +; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] +define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { + %val = load float, float addrspace(3)* @g_lds + store float %val, float addrspace(1)* %out + ret void +} + + +@ptr = addrspace(3) global i32 addrspace(3)* undef +@dst = addrspace(3) global [16384 x i32] undef + +; FUNC-LABEL: {{^}}global_ptr: +; SI: ds_write_b32 +define void @global_ptr() nounwind { + store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr + ret void +} + +; FUNC-LABEL: {{^}}local_address_store: +; SI: ds_write_b32 +define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { + store i32 %val, i32 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_store: +; SI: s_add_i32 [[SADDR:s[0-9]+]], +; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] +; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} +define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { + %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 %offset + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} +; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 +define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} +define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/README b/llvm/test/CodeGen/AMDGPU/README new file mode 100644 index 00000000000..96998bba28f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/README @@ -0,0 +1,21 @@ ++==============================================================================+ +| How to organize the lit tests | ++==============================================================================+ + +- If you write a test for matching a single DAG opcode or intrinsic, it should + go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) + +- If you write a test that matches several DAG opcodes and checks for a single + ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. + bfi_int.ll) + +- For all other tests, use your best judgement for organizing tests and naming + the files. + ++==============================================================================+ +| Naming conventions | ++==============================================================================+ + +- Use dash '-' and not underscore '_' to separate words in file names, unless + the file is named after a DAG opcode or ISA instruction that has an + underscore '_' in its name. diff --git a/llvm/test/CodeGen/AMDGPU/add-debug.ll b/llvm/test/CodeGen/AMDGPU/add-debug.ll new file mode 100644 index 00000000000..529905dd36a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add-debug.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug +; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug +; REQUIRES: asserts + +; Check that SelectionDAGDumper does not crash on int_SI_if.
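+; (int_SI_if is the internal intrinsic the SI backend uses to annotate +; structured control flow for branches like the one below.)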
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll new file mode 100644 index 00000000000..655e75dbc1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -0,0 +1,192 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test1: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} +;SI-NOT: [[REG]] +;SI: buffer_store_dword [[REG]], +define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test2: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = add <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test4: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = add <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test8: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +entry: + %0 = add <8 x i32> %a, %b + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test16: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; 
EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +entry: + %0 = add <16 x i32> %a, %b + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}add64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = add i64 %a, %b + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they +; use VCC. The test is designed so that %a will be stored in an SGPR and +; %0 will be stored in a VGPR, so the compiler will be forced to copy %a +; to a VGPR before doing the add. + +; FUNC-LABEL: {{^}}add64_sgpr_vgpr: +; SI-NOT: v_addc_u32_e32 s + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + %1 = add i64 %a, %0 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll new file mode 100644 index 00000000000..8346add7df9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; SI-LABEL: {{^}}test_i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b = load i64, i64 addrspace(1)* %b_ptr + %result = add i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Check that the SGPR add operand is correctly moved to a VGPR. +; SI-LABEL: {{^}}sgpr_operand: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %foo, %a + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Swap the arguments. Check that the SGPR -> VGPR copy works with the +; SGPR as the other operand.
+; +; SI-LABEL: {{^}}sgpr_operand_reversed: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %a, %foo + store i64 %result, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}test_v2i64_sreg: +; SI: s_add_u32 +; SI: s_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 +define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}test_v2i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}trunc_i64_add_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI-NOT: addc +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %add = add i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/address-space.ll b/llvm/test/CodeGen/AMDGPU/address-space.ll new file mode 100644 index 00000000000..4be8c584752 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/address-space.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; Test that codegenprepare understands address space sizes + +%struct.foo = type { [3 x float], [3 x float] } + +; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is +; already in a VGPR after the first read. 
+ +; CHECK-LABEL: {{^}}do_as_ptr_calcs: +; CHECK: s_load_dword [[SREG1:s[0-9]+]], +; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] +; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +entry: + %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 + %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 + br label %bb32 + +bb32: + %a = load float, float addrspace(3)* %x, align 4 + %b = load float, float addrspace(3)* %y, align 4 + %cmp = fcmp one float %a, %b + br i1 %cmp, label %bb34, label %bb33 + +bb33: + unreachable + +bb34: + unreachable +} + + diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll new file mode 100644 index 00000000000..5672d470bd7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test2: +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = and <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test4: +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_i32: +; SI: s_and_b32 +define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i32: +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 +define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32: +; SI: v_and_b32 +define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { + %a = load i32, 
i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} +define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} +define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 64 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} +define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, -16 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_i64 +; SI: s_and_b64 +define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Should use SGPRs +; FUNC-LABEL: {{^}}s_and_i1: +; SI: v_and_b32 +define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { + %and = and i1 %a, %b + store i1 %and, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { + %and = and i64 %a, 281474976710655 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64_br: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %endif + +if: + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + br label %endif + +endif: + %tmp1 = phi i64 [%and, %if], [0, %entry] + store i64 %tmp1, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 1234567 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Replace and 0 with mov 0 +; FUNC-LABEL: {{^}}v_and_inline_imm_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: 
{{^}}s_and_inline_imm_64_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64 +define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1 +define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 +define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4607182418800017408 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 +define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13830554455654793216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 +define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4602678819172646912 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 +define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13826050856027422720 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0 +define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4611686018427387904 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0 +define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13835058055282163712 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 +define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4616189618054758400 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 +define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13839561654909534208 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + + +; Test with the 64-bit integer bitpattern for a 32-bit float in the +; low 32-bits, which is not a valid 64-bit inline immediate.
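+; For reference, 1082130432 is 0x40800000 (4.0f) with a zero high dword, and +; -1065353216 sign-extends to 0xFFFFFFFFC0800000 (-4.0f in the low dword, all +; ones in the high dword); neither matches a 64-bit inline immediate, so the +; constants are first materialized into an SGPR pair.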
+ +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1082130432 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Copy of -1 register +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} +; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}} +define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, -1065353216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; Shift into upper 32-bits +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4647714815446351872 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13871086852301127680 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll new file mode 100644 index 00000000000..48d8f312249 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}anyext_i1_i32: +; CHECK: v_cndmask_b32_e64 +define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + %1 = zext i1 %0 to i8 + %2 = xor i8 %1, -1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll new file mode 100644 index 00000000000..8c2a0795860 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -0,0 +1,44 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + +; The pointer calculations required for the alloca actually need an add, +; which won't be folded into the addressing and would fail with a +; 64-bit pointer add. This should work since private pointers should +; be 32 bits.
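+; (On SI, private accesses are lowered to buffer instructions that take a +; 32-bit offset, so the address arithmetic can stay 32-bit.)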
+ +; SI-LABEL: {{^}}test_private_array_ptr_calc: + +; FIXME: We end up with a zero argument for the ADD, because +; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index +; with the appropriate offset. We should fold this into the store. +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] +; +; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this +; alloca to a vector. It currently fails because it does not know how +; to interpret: +; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + +; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 +; SI-PROMOTE: ds_write_b32 [[PTRREG]] +define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %alloca = alloca [4 x i32], i32 4, align 16 + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + store i32 %result, i32* %alloca_ptr, align 4 + ; Dummy call + call void @llvm.AMDGPU.barrier.local() nounwind noduplicate + %reload = load i32, i32* %alloca_ptr, align 4 + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll new file mode 100644 index 00000000000..eae095eb844 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() readnone + +; SI-LABEL: {{^}}test_array_ptr_calc: +; SI: v_mul_lo_i32 +; SI: v_mul_hi_i32 +define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll new file mode 100644 index 00000000000..ef2560ef184 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: +; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword
[[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset +; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dwordx2 
s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll new file mode 100644 index 00000000000..20c685447ee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_local: +; R600: LDS_ADD * +; SI: ds_add_u32 +define void @atomic_add_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_local_const_offset: +; R600: LDS_ADD * +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 +define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll new file mode 100644 index 00000000000..4c6f45525b9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_sub_local: +; R600: LDS_SUB * +; SI: ds_sub_u32 +define void @atomic_sub_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile sub i32 addrspace(3)* 
%local, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: +; R600: LDS_SUB * +; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 +define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll new file mode 100644 index 00000000000..abdc4afef47 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -0,0 +1,16 @@ +; XFAIL: * +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_branch( +define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { + %cmp = icmp ne i32 %val, 0 + br i1 %cmp, label %store, label %end + +store: + store i32 222, i32 addrspace(1)* %out + ret void + +end: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/basic-loop.ll b/llvm/test/CodeGen/AMDGPU/basic-loop.ll new file mode 100644 index 00000000000..f0263caf5d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/basic-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_loop: +define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +entry: + br label %loop.body + +loop.body: + %i = phi i32 [0, %entry], [%i.inc, %loop.body] + store i32 222, i32 addrspace(1)* %out + %cmp = icmp ne i32 %i, %val + %i.inc = add i32 %i, 1 + br i1 %cmp, label %loop.body, label %end + +end: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bfe_uint.ll b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll new file mode 100644 index 00000000000..32e3fc26106 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}bfe_def: +; CHECK: BFE_UINT +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 5 + %1 = and i32 %0, 15 ; 0xf + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; This program could be implemented using a BFE_UINT instruction, however +; since the lshr constant + number of bits in the mask is >= 32, it can also be +; implemented with an LSHR instruction, which is better, because LSHR has fewer +; operands and requires fewer constants.
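+; (In bfe_shift below, %x is shifted right by 16 and masked with 0xffff; the shift +; already clears the upper 16 bits, so shift amount + field width = 32 and the mask +; is redundant, leaving a single LSHR.)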
+ +; CHECK: {{^}}bfe_shift: +; CHECK-NOT: BFE_UINT +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 16 + %1 = and i32 %0, 65535 ; 0xffff + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll new file mode 100644 index 00000000000..03349349735 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; BFI_INT Definition pattern from ISA docs +; (y & x) | (z & ~x) +; +; R600: {{^}}bfi_def: +; R600: BFI_INT +; SI: @bfi_def +; SI: v_bfi_b32 +define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %x, -1 + %1 = and i32 %z, %0 + %2 = and i32 %y, %x + %3 = or i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ch function +; z ^ (x & (y ^ z)) +; R600: {{^}}bfi_sha256_ch: +; R600: BFI_INT +; SI: @bfi_sha256_ch +; SI: v_bfi_b32 +define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %y, %z + %1 = and i32 %x, %0 + %2 = xor i32 %z, %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ma function +; ((x & z) | (y & (x | z))) +; R600: {{^}}bfi_sha256_ma: +; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W +; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W +; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} +; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} + +define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = and i32 %x, %z + %1 = or i32 %x, %z + %2 = and i32 %y, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/big_alu.ll b/llvm/test/CodeGen/AMDGPU/big_alu.ll new file mode 100644 index 00000000000..2671c5d102b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/big_alu.ll @@ -0,0 +1,1173 @@ +;RUN: llc < %s -march=r600 -mcpu=cedar + +;This test ensures that the R600 backend can handle ifcvt properly +;and does not generate ALU clauses with more than 128 instructions.
+ +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { +main_body: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = extractelement <4 x float> %reg0, i32 1 + %2 = extractelement <4 x float> %reg0, i32 2 + %3 = extractelement <4 x float> %reg0, i32 3 + %4 = extractelement <4 x float> %reg1, i32 0 + %5 = extractelement <4 x float> %reg9, i32 0 + %6 = extractelement <4 x float> %reg8, i32 0 + %7 = fcmp ugt float %6, 0.000000e+00 + %8 = select i1 %7, float %4, float %5 + %9 = extractelement <4 x float> %reg1, i32 1 + %10 = extractelement <4 x float> %reg9, i32 1 + %11 = extractelement <4 x float> %reg8, i32 0 + %12 = fcmp ugt float %11, 0.000000e+00 + %13 = select i1 %12, float %9, float %10 + %14 = extractelement <4 x float> %reg1, i32 2 + %15 = extractelement <4 x float> %reg9, i32 2 + %16 = extractelement <4 x float> %reg8, i32 0 + %17 = fcmp ugt float %16, 0.000000e+00 + %18 = select i1 %17, float %14, float %15 + %19 = extractelement <4 x float> %reg1, i32 3 + %20 = extractelement <4 x float> %reg9, i32 3 + %21 = extractelement <4 x float> %reg8, i32 0 + %22 = extractelement <4 x float> %reg2, i32 0 + %23 = extractelement <4 x float> %reg2, i32 1 + %24 = extractelement <4 x float> %reg2, i32 2 + %25 = extractelement <4 x float> %reg2, i32 3 + %26 = extractelement <4 x float> %reg3, i32 0 + %27 = extractelement <4 x float> %reg3, i32 1 + %28 = extractelement <4 x float> %reg3, i32 2 + %29 = extractelement <4 x float> %reg3, i32 3 + %30 = extractelement <4 x float> %reg4, i32 0 + %31 = extractelement <4 x float> %reg4, i32 1 + %32 = extractelement <4 x float> %reg4, i32 2 + %33 = extractelement <4 x float> %reg4, i32 3 + %34 = extractelement <4 x float> %reg5, i32 0 + %35 = extractelement <4 x float> %reg5, i32 1 + %36 = extractelement <4 x float> %reg5, i32 2 + %37 = extractelement <4 x float> %reg5, i32 3 + %38 = extractelement <4 x float> %reg6, i32 0 + %39 = extractelement <4 x float> %reg6, i32 1 + %40 = extractelement <4 x float> %reg6, i32 2 + %41 = extractelement <4 x float> %reg6, i32 3 + %42 = extractelement <4 x float> %reg7, i32 0 + %43 = extractelement <4 x float> %reg7, i32 1 + %44 = extractelement <4 x float> %reg7, i32 2 + %45 = extractelement <4 x float> %reg7, i32 3 + %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %47 = extractelement <4 x float> %46, i32 0 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %49 = extractelement <4 x float> %48, i32 1 + %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %51 = extractelement <4 x float> %50, i32 2 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %53 = extractelement <4 x float> %52, i32 0 + %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %55 = extractelement <4 x float> %54, i32 0 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %57 
= extractelement <4 x float> %56, i32 1 + %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %59 = extractelement <4 x float> %58, i32 2 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %61 = extractelement <4 x float> %60, i32 3 + %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %63 = extractelement <4 x float> %62, i32 0 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %65 = extractelement <4 x float> %64, i32 1 + %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %67 = extractelement <4 x float> %66, i32 2 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %69 = extractelement <4 x float> %68, i32 0 + %70 = fcmp oge float %69, 3.500000e+00 + %71 = sext i1 %70 to i32 + %72 = bitcast i32 %71 to float + %73 = bitcast float %72 to i32 + %74 = icmp ne i32 %73, 0 + %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 + %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %76 = extractelement <4 x float> %75, i32 0 + %77 = fcmp oge float %76, 2.000000e+00 + %78 = sext i1 %77 to i32 + %79 = bitcast i32 %78 to float + %80 = bitcast float %79 to i32 + %81 = icmp ne i32 %80, 0 + br i1 %81, label %IF137, label %ENDIF136 + +IF137: ; preds = %main_body + %82 = insertelement <4 x float> undef, float %30, i32 0 + %83 = insertelement <4 x float> %82, float %31, i32 1 + %84 = insertelement <4 x float> %83, float %32, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = insertelement <4 x float> undef, float %30, i32 0 + %87 = insertelement <4 x float> %86, float %31, i32 1 + %88 = insertelement <4 x float> %87, float %32, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) + %91 = call float @llvm.AMDGPU.rsq.f32(float %90) + %92 = fmul float %30, %91 + %93 = fmul float %31, %91 + %94 = fmul float %32, %91 + %95 = insertelement <4 x float> undef, float %92, i32 0 + %96 = insertelement <4 x float> %95, float %93, i32 1 + %97 = insertelement <4 x float> %96, float %94, i32 2 + %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 + %99 = insertelement <4 x float> undef, float %37, i32 0 + %100 = insertelement <4 x float> %99, float %38, i32 1 + %101 = insertelement <4 x float> %100, float %39, i32 2 + %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 + %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) + %104 = insertelement <4 x float> undef, float %92, i32 0 + %105 = insertelement <4 x float> %104, float %93, i32 1 + %106 = insertelement <4 x float> %105, float %94, i32 2 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 + %108 = insertelement <4 x float> undef, float %40, i32 0 + %109 = insertelement <4 x float> %108, float %41, i32 1 + %110 = insertelement <4 x float> %109, float %42, i32 2 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 + %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) + %113 = fsub float -0.000000e+00, %92 + %114 = fsub float -0.000000e+00, %93 + %115 = fsub float -0.000000e+00, %94 + %116 = insertelement <4 x float> undef, float %34, i32 0 + %117 = insertelement <4 x float> %116, float %35, i32 1 + %118 = insertelement <4 x float> %117, float %36, i32 2 + %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 + %120 = insertelement <4 x float> undef, float %113, i32 0 + %121 = insertelement <4 x float> %120, float %114, i32 1 + %122 = insertelement <4 x float> %121, float %115, i32 2 + %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 + %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) + %125 = fdiv float 1.000000e+00, %124 + %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %127 = extractelement <4 x float> %126, i32 0 + %128 = fmul float %127, %125 + %129 = fmul float %103, %128 + %130 = fmul float %112, %128 + %131 = bitcast float %. to i32 + %132 = sitofp i32 %131 to float + %133 = fdiv float 1.000000e+00, %132 + %134 = bitcast float %. to i32 + %135 = add i32 %134, -1 + %136 = bitcast i32 %135 to float + %137 = bitcast float %136 to i32 + br label %LOOP + +ENDIF136: ; preds = %main_body, %ENDIF154 + %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] + %138 = fmul float %26, 0x3F847AE140000000 + %139 = fmul float %27, 0x3F847AE140000000 + %140 = fmul float %28, 0x3F847AE140000000 + %141 = insertelement <4 x float> undef, float %138, i32 0 + %142 = insertelement <4 x float> %141, float %139, i32 1 + %143 = insertelement <4 x float> %142, float %140, i32 2 + %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 + %145 = extractelement <4 x float> %144, i32 0 + %146 = extractelement <4 x float> %144, i32 1 + %147 = extractelement <4 x float> %144, i32 2 + %148 = extractelement <4 x float> %144, i32 3 + %149 = insertelement <4 x float> undef, float %145, i32 0 + %150 = insertelement <4 x float> %149, float %146, i32 1 + %151 = insertelement <4 x float> %150, float %147, i32 2 + %152 = insertelement <4 x float> %151, float %148, i32 3 + %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) + %154 = extractelement <4 x float> %153, i32 0 + %155 = extractelement <4 x float> %153, i32 1 + %156 = extractelement <4 x float> %153, i32 2 + %157 = extractelement <4 x float> %153, i32 3 + %158 = fmul float %26, 0x3F45A07B40000000 + %159 = fmul float %27, 0x3F45A07B40000000 + %160 = fmul float %28, 0x3F45A07B40000000 + %161 = insertelement <4 x float> undef, float %158, i32 0 + %162 = insertelement <4 x float> %161, float %159, i32 1 + %163 = insertelement <4 x float> %162, float %160, i32 2 + %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 + %165 = extractelement <4 x float> %164, i32 0 + %166 = extractelement <4 x float> %164, i32 1 + %167 = extractelement <4 x float> %164, i32 2 + %168 = extractelement <4 x float> %164, i32 3 + %169 = insertelement <4 x float> undef, float %165, i32 0 + %170 = insertelement <4 x float> %169, float %166, i32 1 + %171 = insertelement <4 x float> %170, float %167, i32 2 + %172 = insertelement <4 x float> %171, float %168, i32 3 + %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) + 
%174 = extractelement <4 x float> %173, i32 0 + %175 = extractelement <4 x float> %173, i32 1 + %176 = extractelement <4 x float> %173, i32 2 + %177 = extractelement <4 x float> %173, i32 3 + %178 = fmul float %176, 3.000000e+03 + %179 = fadd float %178, %28 + %180 = fdiv float 1.000000e+00, %33 + %181 = fmul float %32, %180 + %182 = call float @fabs(float %181) + %183 = fmul float %174, 0x3FD99999A0000000 + %184 = fadd float %183, 0x3FAEB851E0000000 + %185 = fmul float %175, 0x3FE3333340000000 + %186 = fadd float %185, %184 + %187 = fmul float %176, 2.000000e+00 + %188 = fadd float %187, %186 + %189 = fmul float %177, 4.000000e+00 + %190 = fadd float %189, %188 + %191 = fmul float %154, 0x3FB99999A0000000 + %192 = fadd float %191, %190 + %193 = fmul float %155, 0x3FD99999A0000000 + %194 = fadd float %193, %192 + %195 = fmul float %156, 0x3FE99999A0000000 + %196 = fadd float %195, %194 + %197 = fmul float %157, 0x4000CCCCC0000000 + %198 = fadd float %197, %196 + %199 = fmul float 0xBE5EFB4CC0000000, %182 + %200 = fmul float %199, %182 + %201 = call float @llvm.AMDIL.exp.(float %200) + %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) + %203 = fadd float %202, 0x3FF4CCCCC0000000 + %204 = fmul float %203, 0x3FE1C71C80000000 + %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) + %206 = fadd float %202, 0x3FF4CCCCC0000000 + %207 = fmul float %206, 0x3FE1C71C80000000 + %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) + %209 = fadd float %202, 2.000000e+00 + %210 = fmul float %209, 0x3FD611A7A0000000 + %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) + %212 = fmul float 2.000000e+00, %205 + %213 = fsub float -0.000000e+00, %212 + %214 = fadd float 3.000000e+00, %213 + %215 = fmul float %205, %214 + %216 = fmul float %205, %215 + %217 = fmul float 2.000000e+00, %208 + %218 = fsub float -0.000000e+00, %217 + %219 = fadd float 3.000000e+00, %218 + %220 = fmul float %208, %219 + %221 = fmul float %208, %220 + %222 = fmul float 2.000000e+00, %211 + %223 = fsub float -0.000000e+00, %222 + %224 = fadd float 3.000000e+00, %223 + %225 = fmul float %211, %224 + %226 = fmul float %211, %225 + %227 = fmul float %26, 0x3F368B5CC0000000 + %228 = fmul float %27, 0x3F368B5CC0000000 + %229 = insertelement <4 x float> undef, float %227, i32 0 + %230 = insertelement <4 x float> %229, float %228, i32 1 + %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 + %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 + %233 = extractelement <4 x float> %232, i32 0 + %234 = extractelement <4 x float> %232, i32 1 + %235 = insertelement <4 x float> undef, float %233, i32 0 + %236 = insertelement <4 x float> %235, float %234, i32 1 + %237 = insertelement <4 x float> %236, float undef, i32 2 + %238 = insertelement <4 x float> %237, float undef, i32 3 + %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) + %240 = extractelement <4 x float> %239, i32 0 + %241 = insertelement <4 x float> undef, float %240, i32 0 + %242 = insertelement <4 x float> %241, float %228, i32 1 + %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 + %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 + %245 = extractelement <4 x float> %244, i32 0 + %246 = insertelement <4 x float> undef, float %245, i32 0 + %247 = insertelement <4 x float> %246, float undef, i32 1 + %248 = insertelement <4 x float> %247, float undef, i32 2 + 
%249 = insertelement <4 x float> %248, float undef, i32 3 + %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) + %251 = extractelement <4 x float> %250, i32 0 + %252 = extractelement <4 x float> %250, i32 1 + %253 = extractelement <4 x float> %250, i32 2 + %254 = extractelement <4 x float> %250, i32 3 + %255 = fmul float %251, %216 + %256 = fmul float %252, %221 + %257 = fmul float %253, %226 + %258 = fmul float %254, 0.000000e+00 + %259 = fadd float %202, 0x3FF4CCCCC0000000 + %260 = fmul float %259, 0x3FE1C71C80000000 + %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) + %262 = fadd float %202, 0x3FF4CCCCC0000000 + %263 = fmul float %262, 0x3FE1C71C80000000 + %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) + %265 = fadd float %202, 2.000000e+00 + %266 = fmul float %265, 0x3FD611A7A0000000 + %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) + %268 = fmul float 2.000000e+00, %261 + %269 = fsub float -0.000000e+00, %268 + %270 = fadd float 3.000000e+00, %269 + %271 = fmul float %261, %270 + %272 = fmul float %261, %271 + %273 = fmul float 2.000000e+00, %264 + %274 = fsub float -0.000000e+00, %273 + %275 = fadd float 3.000000e+00, %274 + %276 = fmul float %264, %275 + %277 = fmul float %264, %276 + %278 = fmul float 2.000000e+00, %267 + %279 = fsub float -0.000000e+00, %278 + %280 = fadd float 3.000000e+00, %279 + %281 = fmul float %267, %280 + %282 = fmul float %267, %281 + %283 = fmul float %26, 0x3F22DFD6A0000000 + %284 = fmul float %27, 0x3F22DFD6A0000000 + %285 = insertelement <4 x float> undef, float %283, i32 0 + %286 = insertelement <4 x float> %285, float %284, i32 1 + %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 + %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 + %289 = extractelement <4 x float> %288, i32 0 + %290 = extractelement <4 x float> %288, i32 1 + %291 = insertelement <4 x float> undef, float %289, i32 0 + %292 = insertelement <4 x float> %291, float %290, i32 1 + %293 = insertelement <4 x float> %292, float undef, i32 2 + %294 = insertelement <4 x float> %293, float undef, i32 3 + %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) + %296 = extractelement <4 x float> %295, i32 0 + %297 = extractelement <4 x float> %295, i32 1 + %298 = extractelement <4 x float> %295, i32 2 + %299 = extractelement <4 x float> %295, i32 3 + %300 = fmul float %296, %272 + %301 = fmul float %297, %277 + %302 = fmul float %298, %282 + %303 = fmul float %299, 0.000000e+00 + %304 = fmul float %temp68.1, %37 + %305 = fmul float %temp68.1, %38 + %306 = fmul float %temp68.1, %39 + %307 = fmul float %temp69.0, %40 + %308 = fadd float %307, %304 + %309 = fmul float %temp69.0, %41 + %310 = fadd float %309, %305 + %311 = fmul float %temp69.0, %42 + %312 = fadd float %311, %306 + %313 = fmul float %temp70.0, %34 + %314 = fadd float %313, %308 + %315 = fmul float %temp70.0, %35 + %316 = fadd float %315, %310 + %317 = fmul float %temp70.0, %36 + %318 = fadd float %317, %312 + %319 = insertelement <4 x float> undef, float %314, i32 0 + %320 = insertelement <4 x float> %319, float %316, i32 1 + %321 = insertelement <4 x float> %320, float %318, i32 2 + %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 + %323 = insertelement <4 x float> undef, float %314, i32 0 + %324 = insertelement <4 x float> %323, float %316, i32 1 + %325 = insertelement <4 x float> %324, float %318, i32 2 + 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 + %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) + %328 = call float @llvm.AMDGPU.rsq.f32(float %327) + %329 = fmul float %314, %328 + %330 = fmul float %316, %328 + %331 = fmul float %318, %328 + %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %333 = extractelement <4 x float> %332, i32 0 + %334 = fsub float -0.000000e+00, %333 + %335 = fadd float 1.000000e+00, %334 + %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %337 = extractelement <4 x float> %336, i32 0 + %338 = fsub float -0.000000e+00, %337 + %339 = fadd float 1.000000e+00, %338 + %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %341 = extractelement <4 x float> %340, i32 0 + %342 = fsub float -0.000000e+00, %341 + %343 = fadd float 1.000000e+00, %342 + %344 = fsub float -0.000000e+00, %335 + %345 = fadd float %202, %344 + %346 = fsub float -0.000000e+00, %339 + %347 = fadd float %202, %346 + %348 = fadd float %347, 0xBFE3333340000000 + %349 = fsub float -0.000000e+00, %202 + %350 = fsub float -0.000000e+00, %343 + %351 = fadd float %349, %350 + %352 = insertelement <4 x float> undef, float %43, i32 0 + %353 = insertelement <4 x float> %352, float %44, i32 1 + %354 = insertelement <4 x float> %353, float %45, i32 2 + %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 + %356 = insertelement <4 x float> undef, float %43, i32 0 + %357 = insertelement <4 x float> %356, float %44, i32 1 + %358 = insertelement <4 x float> %357, float %45, i32 2 + %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 + %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) + %361 = call float @llvm.AMDGPU.rsq.f32(float %360) + %362 = fmul float %45, %361 + %363 = call float @fabs(float %362) + %364 = fmul float %176, 0x3FECCCCCC0000000 + %365 = fadd float %364, %363 + %366 = fadd float %365, 0xBFEFAE1480000000 + %367 = fmul float %366, 0xC023FFFFC0000000 + %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) + %369 = fsub float -0.000000e+00, %335 + %370 = fadd float %202, %369 + %371 = fadd float %370, 0x3FBEB851E0000000 + %372 = fsub float -0.000000e+00, %339 + %373 = fadd float %202, %372 + %374 = fadd float %373, 0xBFE0A3D700000000 + %375 = fsub float -0.000000e+00, %202 + %376 = fsub float -0.000000e+00, %343 + %377 = fadd float %375, %376 + %378 = insertelement <4 x float> undef, float %43, i32 0 + %379 = insertelement <4 x float> %378, float %44, i32 1 + %380 = insertelement <4 x float> %379, float %45, i32 2 + %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 + %382 = insertelement <4 x float> undef, float %43, i32 0 + %383 = insertelement <4 x float> %382, float %44, i32 1 + %384 = insertelement <4 x float> %383, float %45, i32 2 + %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 + %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) + %387 = call float @llvm.AMDGPU.rsq.f32(float %386) + %388 = fmul float %45, %387 + %389 = call float @fabs(float %388) + %390 = fmul float %176, 0x3FF51EB860000000 + %391 = fadd float %390, %389 + %392 = fadd float %391, 0xBFEFAE1480000000 + %393 = fmul float %392, 0xC0490001A0000000 + %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) + %395 = fmul float 2.000000e+00, %368 + %396 = fsub float -0.000000e+00, %395 + %397 = fadd float 3.000000e+00, %396 + %398 = fmul float %368, %397 + %399 = fmul float %368, %398 + %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) + %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) + %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) + %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) + %404 = fmul float 2.000000e+00, %394 + %405 = fsub float -0.000000e+00, %404 + %406 = fadd float 3.000000e+00, %405 + %407 = fmul float %394, %406 + %408 = fmul float %394, %407 + %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) + %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) + %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) + %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) + %413 = fcmp oge float 2.200000e+03, %179 + %414 = sext i1 %413 to i32 + %415 = bitcast i32 %414 to float + %416 = bitcast float %415 to i32 + %417 = icmp ne i32 %416, 0 + br i1 %417, label %IF161, label %ENDIF160 + +LOOP: ; preds = %ENDIF139, %IF137 + %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] + %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] + %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] + %418 = bitcast float %temp96.0 to i32 + %419 = icmp sge i32 %418, %137 + %420 = sext i1 %419 to i32 + %421 = bitcast i32 %420 to float + %422 = bitcast float %421 to i32 + %423 = icmp ne i32 %422, 0 + br i1 %423, label %IF140, label %ENDIF139 + +IF140: ; preds = %LOOP + %424 = fmul float %133, 5.000000e-01 + %425 = fmul float %129, %temp92.0 + %426 = fadd float %425, %22 + %427 = fmul float %130, %temp92.0 + %428 = fadd float %427, %23 + %429 = insertelement <4 x float> undef, float %426, i32 0 + %430 = insertelement <4 x float> %429, float %428, i32 1 + %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 + %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 + %433 = extractelement <4 x float> %432, i32 0 + %434 = extractelement <4 x float> %432, i32 1 + %435 = insertelement <4 x float> undef, float %433, i32 0 + %436 = insertelement <4 x float> %435, float %434, i32 1 + %437 = insertelement <4 x float> %436, float undef, i32 2 + %438 = insertelement <4 x float> %437, float undef, i32 3 + %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) + %440 = extractelement <4 x float> %439, i32 3 + %441 = fcmp oge float %temp92.0, %440 + %442 = sext i1 %441 to i32 + %443 = bitcast i32 %442 to float + %444 = bitcast float %443 to i32 + %445 = icmp ne i32 %444, 0 + br i1 %445, label %IF146, label %ENDIF145 + +ENDIF139: ; preds = %LOOP + %446 = fadd float %temp88.0, %133 + %447 = fmul float %129, %446 + %448 = fadd float %447, %22 + %449 = fmul float %130, %446 + %450 = fadd float %449, %23 + %451 = insertelement <4 x float> undef, float %448, i32 0 + %452 = insertelement <4 x float> %451, float %450, i32 1 + %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 + %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 + %455 = extractelement <4 x float> %454, i32 0 + %456 = extractelement <4 x float> %454, i32 1 + %457 = insertelement <4 x float> undef, float %455, i32 0 + %458 = insertelement <4 x float> %457, float %456, i32 1 + %459 = 
insertelement <4 x float> %458, float undef, i32 2 + %460 = insertelement <4 x float> %459, float undef, i32 3 + %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) + %462 = extractelement <4 x float> %461, i32 3 + %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 + %464 = sext i1 %463 to i32 + %465 = bitcast i32 %464 to float + %466 = fcmp oge float %446, %462 + %467 = sext i1 %466 to i32 + %468 = bitcast i32 %467 to float + %469 = bitcast float %465 to i32 + %470 = bitcast float %468 to i32 + %471 = and i32 %469, %470 + %472 = bitcast i32 %471 to float + %473 = bitcast float %472 to i32 + %474 = icmp ne i32 %473, 0 + %.temp92.0 = select i1 %474, float %446, float %temp92.0 + %475 = bitcast float %temp96.0 to i32 + %476 = add i32 %475, 1 + %477 = bitcast i32 %476 to float + br label %LOOP + +IF146: ; preds = %IF140 + %478 = fmul float 2.000000e+00, %424 + %479 = fsub float -0.000000e+00, %478 + %480 = fadd float %temp92.0, %479 + br label %ENDIF145 + +ENDIF145: ; preds = %IF140, %IF146 + %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] + %481 = fadd float %temp88.1, %424 + %482 = fmul float %424, 5.000000e-01 + %483 = fmul float %129, %481 + %484 = fadd float %483, %22 + %485 = fmul float %130, %481 + %486 = fadd float %485, %23 + %487 = insertelement <4 x float> undef, float %484, i32 0 + %488 = insertelement <4 x float> %487, float %486, i32 1 + %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 + %490 = insertelement <4 x float> %489, float %440, i32 3 + %491 = extractelement <4 x float> %490, i32 0 + %492 = extractelement <4 x float> %490, i32 1 + %493 = insertelement <4 x float> undef, float %491, i32 0 + %494 = insertelement <4 x float> %493, float %492, i32 1 + %495 = insertelement <4 x float> %494, float undef, i32 2 + %496 = insertelement <4 x float> %495, float undef, i32 3 + %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) + %498 = extractelement <4 x float> %497, i32 3 + %499 = fcmp oge float %481, %498 + %500 = sext i1 %499 to i32 + %501 = bitcast i32 %500 to float + %502 = bitcast float %501 to i32 + %503 = icmp ne i32 %502, 0 + br i1 %503, label %IF149, label %ENDIF148 + +IF149: ; preds = %ENDIF145 + %504 = fmul float 2.000000e+00, %482 + %505 = fsub float -0.000000e+00, %504 + %506 = fadd float %481, %505 + br label %ENDIF148 + +ENDIF148: ; preds = %ENDIF145, %IF149 + %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] + %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] + %507 = fadd float %temp88.2, %482 + %508 = fmul float %482, 5.000000e-01 + %509 = fmul float %129, %507 + %510 = fadd float %509, %22 + %511 = fmul float %130, %507 + %512 = fadd float %511, %23 + %513 = insertelement <4 x float> undef, float %510, i32 0 + %514 = insertelement <4 x float> %513, float %512, i32 1 + %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 + %516 = insertelement <4 x float> %515, float %498, i32 3 + %517 = extractelement <4 x float> %516, i32 0 + %518 = extractelement <4 x float> %516, i32 1 + %519 = insertelement <4 x float> undef, float %517, i32 0 + %520 = insertelement <4 x float> %519, float %518, i32 1 + %521 = insertelement <4 x float> %520, float undef, i32 2 + %522 = insertelement <4 x float> %521, float undef, i32 3 + %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) + %524 = extractelement <4 x float> %523, i32 3 + %525 = fcmp oge float %507, %524 + %526 = sext i1 %525 to i32 + %527 = bitcast i32 %526 to float + 
%528 = bitcast float %527 to i32 + %529 = icmp ne i32 %528, 0 + br i1 %529, label %IF152, label %ENDIF151 + +IF152: ; preds = %ENDIF148 + %530 = fmul float 2.000000e+00, %508 + %531 = fsub float -0.000000e+00, %530 + %532 = fadd float %507, %531 + br label %ENDIF151 + +ENDIF151: ; preds = %ENDIF148, %IF152 + %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] + %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] + %533 = fadd float %temp88.3, %508 + %534 = fmul float %508, 5.000000e-01 + %535 = fmul float %129, %533 + %536 = fadd float %535, %22 + %537 = fmul float %130, %533 + %538 = fadd float %537, %23 + %539 = insertelement <4 x float> undef, float %536, i32 0 + %540 = insertelement <4 x float> %539, float %538, i32 1 + %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 + %542 = insertelement <4 x float> %541, float %524, i32 3 + %543 = extractelement <4 x float> %542, i32 0 + %544 = extractelement <4 x float> %542, i32 1 + %545 = insertelement <4 x float> undef, float %543, i32 0 + %546 = insertelement <4 x float> %545, float %544, i32 1 + %547 = insertelement <4 x float> %546, float undef, i32 2 + %548 = insertelement <4 x float> %547, float undef, i32 3 + %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) + %550 = extractelement <4 x float> %549, i32 3 + %551 = fcmp oge float %533, %550 + %552 = sext i1 %551 to i32 + %553 = bitcast i32 %552 to float + %554 = bitcast float %553 to i32 + %555 = icmp ne i32 %554, 0 + br i1 %555, label %IF155, label %ENDIF154 + +IF155: ; preds = %ENDIF151 + %556 = fmul float 2.000000e+00, %534 + %557 = fsub float -0.000000e+00, %556 + %558 = fadd float %533, %557 + br label %ENDIF154 + +ENDIF154: ; preds = %ENDIF151, %IF155 + %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] + %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] + %559 = fadd float %temp88.4, %534 + %560 = fmul float %129, %559 + %561 = fadd float %560, %22 + %562 = fmul float %130, %559 + %563 = fadd float %562, %23 + %564 = insertelement <4 x float> undef, float %561, i32 0 + %565 = insertelement <4 x float> %564, float %563, i32 1 + %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 + %567 = insertelement <4 x float> %566, float %550, i32 3 + %568 = extractelement <4 x float> %567, i32 0 + %569 = extractelement <4 x float> %567, i32 1 + %570 = insertelement <4 x float> undef, float %568, i32 0 + %571 = insertelement <4 x float> %570, float %569, i32 1 + %572 = insertelement <4 x float> %571, float undef, i32 2 + %573 = insertelement <4 x float> %572, float undef, i32 3 + %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) + %575 = extractelement <4 x float> %574, i32 3 + %576 = fcmp oge float %559, %575 + %577 = sext i1 %576 to i32 + %578 = bitcast i32 %577 to float + %579 = bitcast float %578 to i32 + %580 = icmp ne i32 %579, 0 + %.temp92.4 = select i1 %580, float %559, float %temp92.4 + %581 = fmul float %129, %.temp92.4 + %582 = fadd float %581, %22 + %583 = fmul float %130, %.temp92.4 + %584 = fadd float %583, %23 + %585 = insertelement <4 x float> undef, float %582, i32 0 + %586 = insertelement <4 x float> %585, float %584, i32 1 + %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 + %588 = insertelement <4 x float> %587, float %575, i32 3 + %589 = extractelement <4 x float> %588, i32 0 + %590 = extractelement <4 x float> %588, i32 1 + %591 = insertelement <4 x float> undef, float %589, i32 0 + %592 = insertelement <4 x float> %591, 
float %590, i32 1 + %593 = insertelement <4 x float> %592, float undef, i32 2 + %594 = insertelement <4 x float> %593, float undef, i32 3 + %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) + %596 = extractelement <4 x float> %595, i32 0 + %597 = extractelement <4 x float> %595, i32 1 + %598 = extractelement <4 x float> %595, i32 2 + %599 = fmul float %596, 2.000000e+00 + %600 = fadd float %599, -1.000000e+00 + %601 = fmul float %597, 2.000000e+00 + %602 = fadd float %601, -1.000000e+00 + %603 = fmul float %598, 2.000000e+00 + %604 = fadd float %603, -1.000000e+00 + br label %ENDIF136 + +IF161: ; preds = %ENDIF136 + %605 = fmul float %202, 0x3FB99999A0000000 + %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 + %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 + %608 = fcmp uge float %607, 5.000000e-01 + %609 = select i1 %608, float 5.000000e-01, float %607 + %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) + %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) + %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) + %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) + %614 = insertelement <4 x float> undef, float %329, i32 0 + %615 = insertelement <4 x float> %614, float %330, i32 1 + %616 = insertelement <4 x float> %615, float %331, i32 2 + %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 + %618 = insertelement <4 x float> undef, float %63, i32 0 + %619 = insertelement <4 x float> %618, float %65, i32 1 + %620 = insertelement <4 x float> %619, float %67, i32 2 + %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 + %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) + %623 = fcmp uge float 0x3FE6666660000000, %622 + %624 = select i1 %623, float 0x3FE6666660000000, float %622 + %625 = fmul float %8, %624 + %626 = fmul float %13, %624 + %627 = fmul float %18, %624 + %628 = insertelement <4 x float> undef, float %34, i32 0 + %629 = insertelement <4 x float> %628, float %35, i32 1 + %630 = insertelement <4 x float> %629, float %36, i32 2 + %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 + %632 = insertelement <4 x float> undef, float %63, i32 0 + %633 = insertelement <4 x float> %632, float %65, i32 1 + %634 = insertelement <4 x float> %633, float %67, i32 2 + %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 + %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) + %637 = fcmp uge float 0x3FECCCCCC0000000, %636 + %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 + %639 = fmul float %625, %638 + %640 = fmul float %626, %638 + %641 = fmul float %627, %638 + br label %ENDIF160 + +ENDIF160: ; preds = %ENDIF136, %IF161 + %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] + %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] + %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] + %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] + %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] + %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %642 = fcmp olt float 2.200000e+03, %179 + %643 = sext i1 %642 to i32 + %644 = bitcast i32 %643 to float + %645 = fcmp olt float %179, 2.300000e+03 + %646 = sext i1 %645 to i32 + %647 = bitcast i32 %646 to float + %648 = bitcast float %644 to i32 + %649 = bitcast float %647 to i32 + %650 = and i32 %648, %649 + %651 = 
bitcast i32 %650 to float + %652 = bitcast float %651 to i32 + %653 = icmp ne i32 %652, 0 + br i1 %653, label %IF164, label %ENDIF163 + +IF164: ; preds = %ENDIF160 + %654 = fmul float %202, 5.000000e-01 + %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 + %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 + %657 = fcmp uge float %656, 0x3FD6666660000000 + %658 = select i1 %657, float 0x3FD6666660000000, float %656 + %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) + %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) + %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) + %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) + %663 = insertelement <4 x float> undef, float %329, i32 0 + %664 = insertelement <4 x float> %663, float %330, i32 1 + %665 = insertelement <4 x float> %664, float %331, i32 2 + %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 + %667 = insertelement <4 x float> undef, float %63, i32 0 + %668 = insertelement <4 x float> %667, float %65, i32 1 + %669 = insertelement <4 x float> %668, float %67, i32 2 + %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 + %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) + %672 = fcmp uge float 0x3FE6666660000000, %671 + %673 = select i1 %672, float 0x3FE6666660000000, float %671 + %674 = fmul float %8, %673 + %675 = fmul float %13, %673 + %676 = fmul float %18, %673 + %677 = insertelement <4 x float> undef, float %34, i32 0 + %678 = insertelement <4 x float> %677, float %35, i32 1 + %679 = insertelement <4 x float> %678, float %36, i32 2 + %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 + %681 = insertelement <4 x float> undef, float %63, i32 0 + %682 = insertelement <4 x float> %681, float %65, i32 1 + %683 = insertelement <4 x float> %682, float %67, i32 2 + %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 + %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) + %686 = fcmp uge float 0x3FECCCCCC0000000, %685 + %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 + %688 = fmul float %674, %687 + %689 = fmul float %675, %687 + %690 = fmul float %676, %687 + br label %ENDIF163 + +ENDIF163: ; preds = %ENDIF160, %IF164 + %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] + %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] + %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] + %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] + %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] + %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] + %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] + %691 = fcmp oge float %179, 2.300000e+03 + %692 = sext i1 %691 to i32 + %693 = bitcast i32 %692 to float + %694 = fcmp olt float %179, 2.480000e+03 + %695 = sext i1 %694 to i32 + %696 = bitcast i32 %695 to float + %697 = bitcast float %693 to i32 + %698 = bitcast float %696 to i32 + %699 = and i32 %697, %698 + %700 = bitcast i32 %699 to float + %701 = bitcast float %700 to i32 + %702 = icmp ne i32 %701, 0 + br i1 %702, label %IF167, label %ENDIF166 + +IF167: ; preds = %ENDIF163 + %703 = fmul float %202, 5.000000e-01 + %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 + %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 + %706 = fcmp uge float %705, 0x3FD3333340000000 + %707 = select i1 %706, float 0x3FD3333340000000, float %705 + %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) + %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) + %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) + %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) + %712 = insertelement <4 x float> undef, float %329, i32 0 + %713 = insertelement <4 x float> %712, float %330, i32 1 + %714 = insertelement <4 x float> %713, float %331, i32 2 + %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 + %716 = insertelement <4 x float> undef, float %63, i32 0 + %717 = insertelement <4 x float> %716, float %65, i32 1 + %718 = insertelement <4 x float> %717, float %67, i32 2 + %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 + %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) + %721 = fcmp uge float 0x3FEB333340000000, %720 + %722 = select i1 %721, float 0x3FEB333340000000, float %720 + %723 = fmul float %8, %722 + %724 = fmul float %13, %722 + %725 = fmul float %18, %722 + %726 = insertelement <4 x float> undef, float %34, i32 0 + %727 = insertelement <4 x float> %726, float %35, i32 1 + %728 = insertelement <4 x float> %727, float %36, i32 2 + %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 + %730 = insertelement <4 x float> undef, float %63, i32 0 + %731 = insertelement <4 x float> %730, float %65, i32 1 + %732 = insertelement <4 x float> %731, float %67, i32 2 + %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 + %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) + %735 = fcmp uge float 0x3FECCCCCC0000000, %734 + %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 + %737 = fmul float %723, %736 + %738 = fmul float %724, %736 + %739 = fmul float %725, %736 + br label %ENDIF166 + +ENDIF166: ; preds = %ENDIF163, %IF167 + %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] + %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] + %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] + %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] + %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] + %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] + %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] + %740 = fcmp oge float %179, 2.480000e+03 + %741 = sext i1 %740 to i32 + %742 = bitcast i32 %741 to float + %743 = fcmp olt float %179, 2.530000e+03 + %744 = sext i1 %743 to i32 + %745 = bitcast i32 %744 to float + %746 = bitcast float %742 to i32 + %747 = bitcast float %745 to i32 + %748 = and i32 %746, %747 + %749 = bitcast i32 %748 to float + %750 = bitcast float %749 to i32 + %751 = icmp ne i32 %750, 0 + br i1 %751, label %IF170, label %ENDIF169 + +IF170: ; preds = %ENDIF166 + %752 = fmul float %202, 5.000000e-01 + %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 + %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 + %755 = fcmp uge float %754, 0x3FC99999A0000000 + %756 = select i1 %755, float 0x3FC99999A0000000, float %754 + %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) + %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) + %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) + %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) + %761 = insertelement <4 x float> undef, float %329, i32 0 + %762 = insertelement <4 x float> %761, float %330, i32 1 + %763 = insertelement <4 x float> %762, float %331, i32 2 + %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 + %765 = insertelement <4 x float> undef, float %63, i32 0 + %766 = insertelement <4 x float> %765, float %65, i32 1 + %767 = insertelement <4 x float> %766, float %67, i32 2 + %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 + %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) + %770 = fcmp uge float 0x3FEB333340000000, %769 + %771 = select i1 %770, float 0x3FEB333340000000, float %769 + %772 = fmul float %8, %771 + %773 = fmul float %13, %771 + %774 = fmul float %18, %771 + %775 = insertelement <4 x float> undef, float %34, i32 0 + %776 = insertelement <4 x float> %775, float %35, i32 1 + %777 = insertelement <4 x float> %776, float %36, i32 2 + %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 + %779 = insertelement <4 x float> undef, float %63, i32 0 + %780 = insertelement <4 x float> %779, float %65, i32 1 + %781 = insertelement <4 x float> %780, float %67, i32 2 + %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 + %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) + %784 = fcmp uge float 0x3FECCCCCC0000000, %783 + %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 + %786 = fmul float %772, %785 + %787 = fmul float %773, %785 + %788 = fmul float %774, %785 + br label %ENDIF169 + +ENDIF169: ; preds = %ENDIF166, %IF170 + %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] + %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] + %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] + %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] + %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] + %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] + %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] + %789 = fcmp oge float %179, 2.530000e+03 + %790 = sext i1 %789 to i32 + %791 = bitcast i32 %790 to float + %792 = fcmp olt float %179, 2.670000e+03 + %793 = sext i1 %792 to i32 + %794 = bitcast i32 %793 to float + %795 = bitcast float %791 to i32 + %796 = bitcast float %794 to i32 + %797 = and i32 %795, %796 + %798 = bitcast i32 %797 to float + %799 = bitcast float %798 to i32 + %800 = icmp ne i32 %799, 0 + br i1 %800, label %IF173, label %ENDIF172 + +IF173: ; preds = %ENDIF169 + %801 = fmul float %202, 5.000000e-01 + %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 + %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 + %804 = fcmp uge float %803, 0x3FB99999A0000000 + %805 = select i1 %804, float 0x3FB99999A0000000, float %803 + %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) + %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) + %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) + %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) + %810 = insertelement <4 x float> undef, float %329, i32 0 + %811 = insertelement <4 x float> %810, float %330, i32 1 + %812 = insertelement <4 x float> %811, float %331, i32 2 + %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 + %814 = insertelement <4 x float> undef, float %63, i32 0 + %815 = insertelement <4 x float> %814, float %65, i32 1 + %816 = insertelement <4 x float> %815, float %67, i32 2 + %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 + %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) + %819 = fcmp uge float 0x3FEB333340000000, %818 + %820 = select i1 %819, float 
0x3FEB333340000000, float %818 + %821 = fmul float %8, %820 + %822 = fmul float %13, %820 + %823 = fmul float %18, %820 + %824 = insertelement <4 x float> undef, float %34, i32 0 + %825 = insertelement <4 x float> %824, float %35, i32 1 + %826 = insertelement <4 x float> %825, float %36, i32 2 + %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 + %828 = insertelement <4 x float> undef, float %63, i32 0 + %829 = insertelement <4 x float> %828, float %65, i32 1 + %830 = insertelement <4 x float> %829, float %67, i32 2 + %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 + %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) + %833 = fcmp uge float 0x3FECCCCCC0000000, %832 + %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 + %835 = fmul float %821, %834 + %836 = fmul float %822, %834 + %837 = fmul float %823, %834 + br label %ENDIF172 + +ENDIF172: ; preds = %ENDIF169, %IF173 + %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] + %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] + %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] + %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] + %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] + %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] + %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] + %838 = fcmp oge float %179, 2.670000e+03 + %839 = sext i1 %838 to i32 + %840 = bitcast i32 %839 to float + %841 = bitcast float %840 to i32 + %842 = icmp ne i32 %841, 0 + br i1 %842, label %IF176, label %ENDIF175 + +IF176: ; preds = %ENDIF172 + %843 = fmul float %202, 0x3FB99999A0000000 + %844 = fcmp uge float 0.000000e+00, %843 + %845 = select i1 %844, float 0.000000e+00, float %843 + %846 = fcmp uge float %845, 0x3FD99999A0000000 + %847 = select i1 %846, float 0x3FD99999A0000000, float %845 + %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) + %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) + %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) + %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) + %852 = insertelement <4 x float> undef, float %329, i32 0 + %853 = insertelement <4 x float> %852, float %330, i32 1 + %854 = insertelement <4 x float> %853, float %331, i32 2 + %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 + %856 = insertelement <4 x float> undef, float %63, i32 0 + %857 = insertelement <4 x float> %856, float %65, i32 1 + %858 = insertelement <4 x float> %857, float %67, i32 2 + %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 + %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) + %861 = fcmp uge float 0x3FEB333340000000, %860 + %862 = select i1 %861, float 0x3FEB333340000000, float %860 + %863 = fmul float %8, %862 + %864 = fmul float %13, %862 + %865 = fmul float %18, %862 + %866 = insertelement <4 x float> undef, float %34, i32 0 + %867 = insertelement <4 x float> %866, float %35, i32 1 + %868 = insertelement <4 x float> %867, float %36, i32 2 + %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 + %870 = insertelement <4 x float> undef, float %63, i32 0 + %871 = insertelement <4 x float> %870, float %65, i32 1 + %872 = insertelement <4 x float> %871, float %67, i32 2 + %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 + %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) + %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 + %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 + %877 = fmul float %863, %876 + %878 = fmul float %864, %876 + %879 = fmul float %865, %876 + br label %ENDIF175 + +ENDIF175: ; preds = %ENDIF172, %IF176 + %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] + %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] + %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] + %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] + %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] + %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] + %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] + %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %881 = extractelement <4 x float> %880, i32 0 + %882 = fcmp olt float %881, %179 + %883 = sext i1 %882 to i32 + %884 = bitcast i32 %883 to float + %885 = bitcast float %884 to i32 + %886 = icmp ne i32 %885, 0 + br i1 %886, label %IF179, label %ENDIF178 + +IF179: ; preds = %ENDIF175 + %887 = fadd float %202, 1.000000e+00 + %888 = fadd float %202, 1.000000e+00 + %889 = fadd float %202, 1.000000e+00 + %890 = insertelement <4 x float> undef, float %43, i32 0 + %891 = insertelement <4 x float> %890, float %44, i32 1 + %892 = insertelement <4 x float> %891, float %45, i32 2 + %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 + %894 = insertelement <4 x float> undef, float %43, i32 0 + %895 = insertelement <4 x float> %894, float %44, i32 1 + %896 = insertelement <4 x float> %895, float %45, i32 2 + %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 + %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) + %899 = call float @llvm.AMDGPU.rsq.f32(float %898) + %900 = fmul float %45, %899 + %901 = call float @fabs(float %900) + %902 = fmul float %176, 0x3FECCCCCC0000000 + %903 = fadd float %902, %901 + %904 = fadd float %903, 0xBFEFAE1480000000 + %905 = fmul float %904, 0xC043FFFE20000000 + %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) + %907 = fmul float 2.000000e+00, %906 + %908 = fsub float -0.000000e+00, %907 + %909 = fadd float 3.000000e+00, %908 + %910 = fmul float %906, %909 + %911 = fmul float %906, %910 + %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) + %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) + %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) + %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) + %916 = fmul float %202, 5.000000e-01 + %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 + %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 + %919 = fcmp uge float %918, 0x3FE3333340000000 + %920 = select i1 %919, float 0x3FE3333340000000, float %918 + %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) + %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) + %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) + %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) + %925 = insertelement <4 x float> undef, float %329, i32 0 + %926 = insertelement <4 x float> %925, float %330, i32 1 + %927 = insertelement <4 x float> %926, float %331, i32 2 + %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 + %929 = insertelement <4 x float> 
undef, float %63, i32 0 + %930 = insertelement <4 x float> %929, float %65, i32 1 + %931 = insertelement <4 x float> %930, float %67, i32 2 + %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 + %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) + %934 = fcmp uge float 0x3FE99999A0000000, %933 + %935 = select i1 %934, float 0x3FE99999A0000000, float %933 + %936 = fmul float %8, %935 + %937 = fmul float %13, %935 + %938 = fmul float %18, %935 + %939 = insertelement <4 x float> undef, float %34, i32 0 + %940 = insertelement <4 x float> %939, float %35, i32 1 + %941 = insertelement <4 x float> %940, float %36, i32 2 + %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 + %943 = insertelement <4 x float> undef, float %63, i32 0 + %944 = insertelement <4 x float> %943, float %65, i32 1 + %945 = insertelement <4 x float> %944, float %67, i32 2 + %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 + %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) + %948 = fcmp uge float 0x3FECCCCCC0000000, %947 + %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 + %950 = fmul float %936, %949 + %951 = fmul float %937, %949 + %952 = fmul float %938, %949 + br label %ENDIF178 + +ENDIF178: ; preds = %ENDIF175, %IF179 + %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] + %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] + %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] + %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] + %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] + %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] + %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] + %953 = fmul float %55, %temp92.12 + %954 = fmul float %57, %temp93.6 + %955 = fmul float %59, %temp94.6 + %956 = fmul float %61, 0.000000e+00 + %957 = fmul float %temp84.6, %953 + %958 = fmul float %temp85.6, %954 + %959 = fmul float %temp86.6, %955 + %960 = fmul float %temp87.6, %956 + %961 = fmul float %2, -2.000000e+00 + %962 = fadd float %961, 1.000000e+00 + %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) + %964 = extractelement <4 x float> %963, i32 2 + %965 = fsub float -0.000000e+00, %964 + %966 = fadd float %962, %965 + %967 = fdiv float 1.000000e+00, %966 + %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) + %969 = extractelement <4 x float> %968, i32 2 + %970 = fmul float %969, %967 + %971 = fsub float -0.000000e+00, %53 + %972 = fmul float %971, %53 + %973 = fmul float %972, %970 + %974 = fmul float %973, %970 + %975 = fmul float %974, 0x3FF7154760000000 + %976 = call float @llvm.AMDIL.exp.(float %975) + %977 = fcmp oeq float %53, 1.000000e+00 + %978 = sext i1 %977 to i32 + %979 = bitcast i32 %978 to float + %980 = bitcast float %979 to i32 + %981 = icmp ne i32 %980, 0 + %.184 = select i1 %981, float 1.000000e+00, float %976 + %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) + %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) + %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) + %985 = insertelement <4 x float> undef, float %982, i32 0 + %986 = insertelement <4 x float> %985, float %983, i32 1 + %987 = insertelement <4 x float> %986, float %984, i32 2 + %988 = insertelement <4 x float> 
%987, float %960, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } +attributes #2 = { readonly } diff --git a/llvm/test/CodeGen/AMDGPU/bitcast.ll b/llvm/test/CodeGen/AMDGPU/bitcast.ll new file mode 100644 index 00000000000..fd56d956bf3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bitcast.ll @@ -0,0 +1,79 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; This test just checks that the compiler doesn't crash. + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +; FUNC-LABEL: {{^}}v32i8_to_v8i32: +; SI: s_endpgm +define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +entry: + %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 + %2 = bitcast <32 x i8> %1 to <8 x i32> + %3 = extractelement <8 x i32> %2, i32 1 + %4 = icmp ne i32 %3, 0 + %5 = select i1 %4, float 0.0, float 1.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: +; SI: s_endpgm +define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* + %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 + store <16 x i8> %1, <16 x i8> addrspace(1)* %out + ret void +} + +define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %load = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %load to <2 x i16> + store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 + %bc = bitcast <2 x i16> %load to float + store float %bc, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %bc = bitcast <4 x i8> %load to i32 + store i32 %bc, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %load to <4 x i8> + store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: +; SI: s_endpgm +define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %add = add <2 x i32> %val, <i32 4, i32 9> + %bc = bitcast <2 x i32> %add to double + store double %bc, double
addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: +; SI: s_endpgm +define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { + %val = load double, double addrspace(1)* %in, align 8 + %add = fadd double %val, 4.0 + %bc = bitcast double %add to <2 x i32> + store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll new file mode 100644 index 00000000000..4cf8e4bfed5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.bswap.i32(i32) nounwind readnone +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone +declare i64 @llvm.bswap.i64(i64) nounwind readnone +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone +declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone + +; FUNC-LABEL: @test_bswap_i32 +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 +; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff +; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone + store i32 %bswap, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_bswap_v2i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_bswap_v4i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_bswap_v8i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; 
SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { + %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone + store i64 %bswap, i64 addrspace(1)* %out, align 8 + ret void +} + +define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone + store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone + store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll new file mode 100644 index 00000000000..65eacf5adc4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}build_vector2: +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector2: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} +define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +entry: + store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out + ret void +} + +; R600: {{^}}build_vector4: +; R600: MOV +; R600: MOV +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector4: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 +; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 +; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} +define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/call.ll b/llvm/test/CodeGen/AMDGPU/call.ll new file mode 100644 index 00000000000..e769fd11c28 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/call.ll @@ -0,0 +1,33 @@ +; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported call to function external_function in test_call_external + + +declare i32 @external_function(i32) nounwind + +define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = 
load i32, i32 addrspace(1)* %b_ptr + %c = call i32 @external_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define i32 @defined_function(i32 %x) nounwind noinline { + %y = add i32 %x, 8 + ret i32 %y +} + +define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %c = call i32 @defined_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/call_fs.ll b/llvm/test/CodeGen/AMDGPU/call_fs.ll new file mode 100644 index 00000000000..87bebbc49d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/call_fs.ll @@ -0,0 +1,17 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s + +; EG: .long 257 +; EG: {{^}}call_fs: +; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] +; R600: .long 257 +; R600: {{^}}call_fs: +; R600: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] + + +define void @call_fs() #0 { + ret void +} + +attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll new file mode 100644 index 00000000000..c7b8c403731 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cayman-loop-bug.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK: LOOP_START_DX10 +; CHECK: PUSH +; CHECK-NOT: ALU_PUSH_BEFORE +; CHECK: END_LOOP +; CHECK: END_LOOP +define void @main (<4 x float> inreg %reg0) #0 { +entry: + br label %outer_loop +outer_loop: + %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] + %cond = icmp eq i32 %cnt, 16 + br i1 %cond, label %outer_loop_body, label %exit +outer_loop_body: + %cnt_incr = add i32 %cnt, 1 + br label %inner_loop +inner_loop: + %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] + %cond2 = icmp eq i32 %cnt2, 16 + br i1 %cond2, label %inner_loop_body, label %outer_loop +inner_loop_body: + %cnt2_incr = add i32 %cnt2, 1 + br label %inner_loop +exit: + ret void +} + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll new file mode 100644 index 00000000000..75b87e48622 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -0,0 +1,244 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s 
--check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG32 %s < %t + +; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; REQUIRES: asserts + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions. If we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested3: +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.store.1 + +if.store.1: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + store i32 3, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested4: +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + store i32 4, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested7: +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + store i32 7, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying
bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested8: +define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + %7 = icmp sgt i32 %cond, 70 + br i1 %7, label %if.8, label %if.7.store + +if.7.store: + store i32 7, i32 addrspace(1)* %out + br label %end + +if.8: + store i32 8, i32 addrspace(1)* %out + br label %end + +end: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/cf_end.ll b/llvm/test/CodeGen/AMDGPU/cf_end.ll new file mode 100644 index 00000000000..c74ee22868d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cf_end.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s + +; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] +; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] +define void @eop() { + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll new file mode 100644 index 00000000000..77f7bd01b7f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -0,0 +1,242 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; OPT-LABEL: @test_sink_global_small_offset_i32( +; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: +; GCN: {{^}}BB0_2: +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 
s{{[0-9]+$}} +; GCN: {{^}}BB1_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; GCN: {{^}}BB2_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB3_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB4_2: + +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_scratch_small_offset_i32( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^}}BB5_2: +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = 
getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: {{^}}BB6_2: +define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: {{^}}BB7_2: +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { +entry: + %offset.ext = zext i32 %offset to i64 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll new file mode 100644 index 00000000000..96730bcf2e8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) + +; This checks that rematerialization support of the coalescer does not +; unnecessarily widen the register class. Without those fixes, more than +; 20 VGPRs are used here. +; Also check that some rematerialization of the 0 constant happened.
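+;
+; A minimal sketch of the idea, stated as an assumption about the allocator
+; rather than as verified output: without remat, the single 0.0 feeding the
+; four phis below would be coalesced into one wide <4 x float> register tuple
+; kept live across the whole loop; with remat, the cheap constant is simply
+; re-emitted at each use site,
+;   v_mov_b32_e32 v{{[0-9]+}}, 0
+; which is the pattern the four v_mov_b32 checks below look for.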
+; CHECK-LABEL: foobar +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; It's probably OK if this is slightly higher: +; CHECK: ; NumVgprs: 9 +define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +entry: + %cmpflag = icmp eq i32 %flag, 1 + br i1 %cmpflag, label %loop, label %exit + +loop: + %c = phi i32 [0, %entry], [%cnext, %loop] + %v0 = phi float [0.0, %entry], [%fma.0, %loop] + %v1 = phi float [0.0, %entry], [%fma.1, %loop] + %v2 = phi float [0.0, %entry], [%fma.2, %loop] + %v3 = phi float [0.0, %entry], [%fma.3, %loop] + + ; Try to get the 0 constant to get coalesced into a wide register + %blup = insertelement <4 x float> undef, float %v0, i32 0 + store <4 x float> %blup, <4 x float> addrspace(1)* %out + + %load = load <4 x float>, <4 x float> addrspace(1)* %in + %load.0 = extractelement <4 x float> %load, i32 0 + %load.1 = extractelement <4 x float> %load, i32 1 + %load.2 = extractelement <4 x float> %load, i32 2 + %load.3 = extractelement <4 x float> %load, i32 3 + %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) + %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) + %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) + %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) + + %cnext = add nsw i32 %c, 1 + %cmp = icmp eq i32 %cnext, 42 + br i1 %cmp, label %exit, label %loop + +exit: + %ev0 = phi float [0.0, %entry], [%fma.0, %loop] + %ev1 = phi float [0.0, %entry], [%fma.1, %loop] + %ev2 = phi float [0.0, %entry], [%fma.2, %loop] + %ev3 = phi float [0.0, %entry], [%fma.3, %loop] + %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 + %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 + %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 + %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 + store <4 x float> %dst.3, <4 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll new file mode 100644 index 00000000000..58517209267 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -0,0 +1,18 @@ +; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s + +; OPT-LABEL: @test( +; OPT: mul nsw i32 +; OPT-NEXT: sext + +; SI-LLC-LABEL: {{^}}test: +; SI-LLC: s_mul_i32 +; SI-LLC-NOT: mul +define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +entry: + %0 = mul nsw i32 %a, 3 + %1 = sext i32 %0 to i64 + %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 + store i8 %b, i8 addrspace(1)* %2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll new file mode 100644 index 00000000000..01572afa620 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; +; kernel void combine_vloads(global char8* src, global char8* result) { +; for (int i = 0; i < 1024; ++i) +; result[i] = src[0] + src[1] + src[2] + src[3]; +; } +; + + +; 128-bit loads instead of many 8-bit +; EG-LABEL: {{^}}combine_vloads: +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define void @combine_vloads(<8 x i8> addrspace(1)* 
nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { +entry: + br label %for.body + +for.exit: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] + %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* + %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* + %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 + %1 = bitcast <8 x i32> %vecload2 to <32 x i8> + %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 + %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 + %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 + %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 + %2 = bitcast <8 x i8> %tmp17 to <2 x i32> + %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* + store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 + %tmp19 = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %tmp19, 1024 + br i1 %exitcond, label %for.exit, label %for.body +} diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll new file mode 100644 index 00000000000..31766047a35 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -0,0 +1,697 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; -------------------------------------------------------------------------------- +; i32 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i32: +; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp eq i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i32: +; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Why isn't this being folded as a constant?
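+; (Sketch of the presumably intended output: VOPC src0 can also carry a
+; 32-bit literal constant, so this compare could in principle be emitted
+; directly as
+;   v_cmp_ne_i32_e32 vcc, 0x3039, v{{[0-9]+}}
+; with no v_mov to materialize the constant first. This is an assumption
+; about the desired form, not checked llc output.)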
+; GCN-LABEL: {{^}}commute_ne_litk_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 12345 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ugt i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} +define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp uge i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ult i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} +define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sgt i32 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} +define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sge i32 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i32: +; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} +define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp slt i32 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i32: +; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} +define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sle i32 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; i64 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i64: +; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp eq i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i64: +; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ne i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i64: +; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ugt i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i64: +; GCN: 
v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp uge i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ult i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i64: +; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sgt i64 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sge i64 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i64: +; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + 
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp slt i64 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i64: +; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sle i64 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f32 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f32: +; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oeq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f32: +; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ogt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f32: +; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f32: +; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp olt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f32: +; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ole float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* 
%gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f32: +; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp one float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f32: +; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ord float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f32: +; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ueq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f32: +; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ugt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f32: +; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f32: +; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ult float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f32: +; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + 
%val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ule float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f32: +; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp une float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f32: +; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uno float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f64 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f64: +; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oeq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f64: +; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ogt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f64: +; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f64: +; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp olt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 
%ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f64: +; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ole double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f64: +; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp one double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f64: +; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ord double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f64: +; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ueq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f64: +; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ugt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f64: +; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f64: +; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() 
#0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ult double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f64: +; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ule double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f64: +; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp une double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f64: +; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uno double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll new file mode 100644 index 00000000000..7fc36eabb78 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -0,0 +1,181 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.fma.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: @commute_add_imm_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 2.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + 
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs + %z = fmul float 4.0, %x.fneg.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fneg = fsub float -0.000000e+00, %x + %z = fmul float 4.0, %x.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FIXME: Should use SGPR for literal. +; FUNC-LABEL: @commute_add_lit_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 1024.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_add_fabs_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fadd float %x, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fneg = fsub float -0.000000e+00, %y + %z = fmul float %x, %y.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; There's no reason to commute this. +; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fmul float %x.fabs, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x.fabs, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; Make sure we commute the multiply part so that the constant lands in src0, +; even though there is an fabs modifier on src2.
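For intuition, here is a small standalone C check (an illustration only, not part of the test suite; the input values are arbitrary) that the multiply operands of an fma are interchangeable, which is what makes it safe for the backend to move the constant into src0:

    #include <assert.h>
    #include <math.h>

    int main(void) {
        float a = 3.5f, b = -7.25f;
        /* fma(a, 2.0, |b|) == fma(2.0, a, |b|): the product a*2.0 commutes,
           so the inline immediate 2.0 may be placed in src0. */
        assert(fmaf(a, 2.0f, fabsf(b)) == fmaf(2.0f, a, fabsf(b)));
        return 0;
    }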
+ +; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]| +; SI: buffer_store_dword [[RESULT]] +define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r2.fabs = call float @llvm.fabs.f32(float %r2) + + %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/complex-folding.ll b/llvm/test/CodeGen/AMDGPU/complex-folding.ll new file mode 100644 index 00000000000..a5399a71324 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/complex-folding.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK-NOT: MOV +define void @main(<4 x float> inreg %reg0) #0 { +entry: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = call float @fabs(float %0) + %2 = fptoui float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = insertelement <4 x float> undef, float %3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0) + ret void +} + +declare float @fabs(float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll new file mode 100644 index 00000000000..a09ed1f7385 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_concat_v1i32: +; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF +; instructions that access scratch memory. Bit 23, which is the add_tid_enable +; bit, is only set for scratch access, so we can check for the absence of this +; value if we want to ensure scratch memory is not being used.
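As a quick sanity check on the bit arithmetic in the comment above (plain C, illustration only): 0x80f000 is exactly bit 23 (0x800000, the add_tid_enable bit) plus bits 12-15 (0xf000).

    #include <assert.h>

    int main(void) {
        const unsigned desc_hi = 0x80f000u;       /* value the SI-NOT lines below look for */
        const unsigned add_tid_enable = 1u << 23; /* 0x800000 */
        assert((desc_hi & add_tid_enable) != 0);  /* bit 23 is set */
        assert(desc_hi == (add_tid_enable | 0xf000u));
        return 0;
    }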
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1> + store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { + %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { + %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1> + store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: 
{{^}}test_concat_v1i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { + %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1> + store <2 x i1> %concat, <2 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { + %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i1> %concat, <4 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { + %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i1> %concat, <8 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { + %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i1> %concat, <16 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { + %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i1> %concat, <32 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v32i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { + %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + store <64 x i1> %concat, <64 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1> + store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { + %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i16: +; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { + %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}concat_vector_crash: +; SI: s_endpgm +define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +bb: + %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 + %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9> + store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll new file mode 100644 index 00000000000..8b397566066 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_copy_v4i8: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x3: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x4: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; 
SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte + +; Expected checks once scalarizing of v4i8 loads is fixed: +; XSI: buffer_load_dword +; XSI: V_BFE +; XSI: V_ADD +; XSI: V_ADD +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte + +; XSI: buffer_load_dword +; XSI: BFE +; XSI: buffer_store_dword +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI-NEXT: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8: +; SI-NOT: bfe +; SI-NOT: bfi +; SI: s_endpgm +define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll new file mode 100644 index 00000000000..fc875f6ef7a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn 
-mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s + +; Test that CopyToReg instructions don't have non-register operands prior +; to being emitted. + +; Make sure this doesn't crash +; CHECK-LABEL: {{^}}copy_to_reg_frameindex: +define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %alloca = alloca [16 x i32] + br label %loop + +loop: + %inc = phi i32 [0, %entry], [%inc.i, %loop] + %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc + store i32 %inc, i32* %ptr + %inc.i = add i32 %inc, 1 + %cnd = icmp uge i32 %inc.i, 16 + br i1 %cnd, label %done, label %loop + +done: + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 + %tmp1 = load i32, i32* %tmp0 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll new file mode 100644 index 00000000000..bd26c302fe5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] +define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll new file mode 100644 index 00000000000..0a031c5e24d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll @@ -0,0 +1,300 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctpop_i32: +; GCN: s_load_dword [[SVAL:s[0-9]+]], +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; GCN: buffer_store_dword [[VRESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; XXX - Why 0 in register? 
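The 0 is the second source of v_bcnt_u32_b32, which per the GCN ISA computes D = CountOneBits(S0) + S1; a bare ctpop therefore feeds 0 into S1, while the ctpop+add tests further down fold the addend into S1 instead. A small host-side model in plain C (illustration only; the instruction name is real, the helper function is ours):

    #include <assert.h>
    #include <stdint.h>

    /* Model of v_bcnt_u32_b32: dst = popcount(src0) + src1. */
    static uint32_t v_bcnt_u32_b32(uint32_t src0, uint32_t src1) {
        uint32_t n = 0;
        for (; src0 != 0; src0 >>= 1)
            n += src0 & 1u;
        return n + src1;
    }

    int main(void) {
        assert(v_bcnt_u32_b32(0xffu, 0) == 8);   /* bare ctpop: src1 = 0 */
        assert(v_bcnt_u32_b32(0xffu, 4) == 12);  /* ctpop + 4, add folded into src1 */
        return 0;
    }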
+; FUNC-LABEL: {{^}}v_ctpop_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: +; GCN: buffer_load_dword [[VAL1:v[0-9]+]], +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %val1 = load i32, i32 addrspace(1)* %in1, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone + %add = add i32 %ctpop0, %ctpop1 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %add = add i32 %ctpop0, %sval + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v2i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v4i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v8i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { + 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v16i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 + %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone + store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 4 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 4, %ctpop + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 99999 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone + %add = add i32 %ctpop, %const + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %const, %ctpop + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv: +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}} +; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 + %const = load i32, i32 addrspace(1)* %gep, align 4 + %add = add i32 %const, %ctpop + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: We currently disallow SALU instructions in all branches, +; but there are some cases when they should be allowed. + +; FUNC-LABEL: {{^}}ctpop_i32_in_br: +; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd +; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +; EG: BCNT_INT +define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %else + +if: + %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg) + br label %endif + +else: + %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %tmp4 = load i32, i32 addrspace(1)* %tmp3 + br label %endif + +endif: + %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else] + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll new file mode 100644 index 00000000000..e1a0ee3ea21 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -0,0 +1,124 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone +declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctpop_i64: +; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 
0x2c +; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; GCN: buffer_store_dword [[VRESULT]], +; GCN: s_endpgm +define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %truncctpop = trunc i64 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i64: +; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone + %truncctpop = trunc i64 %ctpop to i32 + store i32 %truncctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_ctpop_v2i64: +; GCN: s_bcnt1_i32_b64 +; GCN: s_bcnt1_i32_b64 +; GCN: s_endpgm +define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_ctpop_v4i64: +; GCN: s_bcnt1_i32_b64 +; GCN: s_bcnt1_i32_b64 +; GCN: s_bcnt1_i32_b64 +; GCN: s_bcnt1_i32_b64 +; GCN: s_endpgm +define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v2i64: +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: s_endpgm +define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone + %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> + store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v4i64: +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: v_bcnt_u32_b32 +; GCN: s_endpgm +define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone + %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> + store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: We currently disallow SALU instructions in all branches, +; but there are some cases when they should be allowed. 
+ +; FUNC-LABEL: {{^}}ctpop_i64_in_br: +; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd +; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 +; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} +; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] +; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] +; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: s_endpgm +define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %else + +if: + %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) + br label %endif + +else: + %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 + %tmp4 = load i64, i64 addrspace(1)* %tmp3 + br label %endif + +endif: + %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] + store i64 %tmp5, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll new file mode 100644 index 00000000000..56fcb51fe14 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? 
*}}[[RESULT]] +define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll new file mode 100644 index 00000000000..3399d9da29e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -0,0 +1,196 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}load_i8_to_f32: +; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] +; SI: buffer_store_dword [[CONV]], +define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %cvt = uitofp i8 %load to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}load_v2i8_to_v2f32: +; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI-NOT: and +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 + %cvt = uitofp <2 x i8> %load to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v3i8_to_v3f32: +; SI-NOT: bfe +; SI-NOT: v_cvt_f32_ubyte3_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <3 x i8> %load to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v4i8_to_v4f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: 
v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; This should not be adding instructions to shift into the correct +; position in the word for the component. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: +; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; SI-NOT: v_lshlrev_b32 +; SI-NOT: v_or_b32 + +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]] + +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; XXX - This should really still be able to use the v_cvt_f32_ubyte0 +; for each component, but computeKnownBits doesn't handle vectors very +; well. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 + +; XXX - replace with this when v4i8 loads aren't scalarized anymore. +; XSI: buffer_load_dword +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load + store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; Make sure this doesn't crash. 
+; SI-LABEL: {{^}}load_v7i8_to_v7f32: +; SI: s_endpgm +define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <7 x i8> %load to <7 x float> + store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v8i8_to_v8f32: +; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 + %cvt = uitofp <8 x i8> %load to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]], +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] +; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] +; SI: buffer_store_dword [[CONV]], +define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 2 + %inreg = and i32 %add, 255 + %cvt = uitofp i32 %inreg to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: +define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %inreg = and i32 %load, 65280 + %shr = lshr i32 %inreg, 8 + %cvt = uitofp i32 %shr to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; We don't get these ones because of the zext, but instcombine removes +; them so it shouldn't really matter. 
+define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %ext = zext i8 %load to i32 + %cvt = uitofp i32 %ext to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %ext = zext <4 x i8> %load to <4 x i32> + %cvt = uitofp <4 x i32> %ext to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll new file mode 100644 index 00000000000..2dd3a9f2a77 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NOT: add +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: +; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 1.0 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %floor = call float @llvm.floor.f32(float %x.fabs) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %floor = call float @llvm.floor.f32(float %x.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: +; SI-NOT: v_cvt_flr_i32_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll new file mode 100644 index 00000000000..864ac40260b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %fadd = fadd float %x.fabs, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work because it forms fsub 0.5, x +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %fadd = fadd float %x.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work for same reason as above +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| + +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %fadd = fadd float %x.fabs.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: +; SI-NOT: v_cvt_rpi_i32_f32 +; SI: v_add_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32 +; SI: s_endpgm +define void 
@no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll new file mode 100644 index 00000000000..fb43ff4fbdd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -0,0 +1,36 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test is for a bug in +; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where +; the wrong type was being passed to +; TargetLowering::getOperationAction() when checking the legality of +; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. + + +; CHECK: {{^}}sint: +; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %sint = load i32, i32 addrspace(1) * %in + %conv = sitofp i32 %sint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} + +;CHECK: {{^}}uint: +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %uint = load i32, i32 addrspace(1) * %in + %conv = uitofp i32 %uint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/debug.ll b/llvm/test/CodeGen/AMDGPU/debug.ll new file mode 100644 index 00000000000..a2e0e878b74 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/debug.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; Test for a crash in the custom assembly dump code. 
+ +; SI: s_endpgm +define void @test(i32 addrspace(1)* %out) { + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll new file mode 100644 index 00000000000..da8e91454b9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_kernel: + +; DEFAULT: FloatMode: 192 +; DEFAULT: IeeeMode: 0 + +; FP64-DENORMAL: FloatMode: 192 +; FP64-DENORMAL: IeeeMode: 0 + +; FP32-DENORMAL: FloatMode: 48 +; FP32-DENORMAL: IeeeMode: 0 + +; BOTH-DENORMAL: FloatMode: 240 +; BOTH-DENORMAL: IeeeMode: 0 + +; NO-DENORMAL: FloatMode: 0 +; NO-DENORMAL: IeeeMode: 0 +define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll new file mode 100644 index 00000000000..cdd2c0cd4f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; PRED_SET* instructions must be tied to any instruction that uses their +; result. This tests that there are no instructions between the PRED_SET* +; and the PREDICATE_BREAK in this loop. 
+ +; CHECK: {{^}}loop_ge: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK-NEXT: JUMP +; CHECK-NEXT: LOOP_BREAK +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/dot4-folding.ll b/llvm/test/CodeGen/AMDGPU/dot4-folding.ll new file mode 100644 index 00000000000..4df7b63bf98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dot4-folding.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Exactly one constant vector can be folded into dot4, which means exactly +; 4 MOV instructions +; CHECK: {{^}}main: +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV + +define void @main(float addrspace(1)* %out) { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) + %3 = insertelement <4 x float> undef, float %2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll new file mode 100644 index 00000000000..e7e13d6178c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() #1 + +; Function Attrs: nounwind +; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: +; CHECK: BB0_1: +; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] +; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] +; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] +; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] + +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 +; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 +; CHECK: s_endpgm +define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +entry: + %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 + %mul = shl nsw i32 %x.i, 1 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] + %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] + %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.AMDGPU.barrier.local() #1 + %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 + %tmp = load float, float addrspace(3)* %arrayidx, align 4 + %add1 = add nsw i32 %offset.02, 1 + %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 + %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 + %add3 = add nsw i32 %offset.02, 32 + %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 + %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 + %add5 = add nsw i32 %offset.02, 33 + %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 + %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 + %add7 = add nsw i32 %offset.02, 64 + %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 + %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 + %add9 = fadd float %tmp, %tmp1 + %add10 = fadd float %add9, %tmp2 + %add11 = fadd float %add10, %tmp3 + %add12 = fadd float %add11, %tmp4 + %add13 = fadd float %sum.03, %add12 + %inc = add nsw i32 %k.01, 1 + %add14 = add nsw i32 %offset.02, 97 + %exitcond = icmp eq i32 %inc, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %tmp5 = sext i32 %x.i to i64 + %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 + store float %add13, float addrspace(1)* %arrayidx15, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll new file mode 100644 index 00000000000..5929898f8bd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -0,0 +1,515 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; FIXME: We don't get cases where the address was an SGPR because we +; get a copy to the address register for each one. 
+ +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + +; SI-LABEL: @simple_read2_f32 +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_max_offset +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_too_far +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_x2 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load 
float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure there is an instruction between the two sets of reads. +; SI-LABEL: @simple_read2_f32_x2_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: s_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + call void @llvm.AMDGPU.barrier.local() #2 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; For some reason adding something to the base address for the first +; element results in only folding the inner pair. 
+ +; SI-LABEL: @simple_read2_f32_x2_nonzero_base +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Be careful of vectors of pointers. We don't know if the 2 pointers +; in the vectors are really the same base, so this is not safe to +; merge. +; Base pointers come from different subregister of same super +; register. We can't safely merge this. + +; SI-LABEL: @read2_ptr_is_subreg_arg_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Apply a constant scalar offset after the pointer vector extract. We +; are rejecting merges that have the same, constant 0 offset, so make +; sure we are really rejecting it because of the different +; subregisters. 
+ +; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. + %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1.offset, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We should be able to merge in this case, but probably not worth the effort. +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 + %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 + %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1 + %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8> + %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_0 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_1 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Can't fold since not correctly aligned. +; XXX: This isn't really testing anything useful now. I think CI +; allows unaligned LDS accesses, which would be a problem here. +; SI-LABEL: @unaligned_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 1 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 1 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @misaligned_2_simple_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 2 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 2 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f64 +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f64(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_max_offset +; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 +; SI: s_endpgm +define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_too_far +; SI-NOT: ds_read2_b64 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 +; SI: s_endpgm +define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only 4 +; SI-LABEL: @misaligned_read2_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @load_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @load_constant_disjoint_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_offsets +; 
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; SI: s_endpgm +define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 + %sum.0 = fadd float %tmp16, %tmp17 + %sum.1 = fadd float %sum.0, %tmp18 + %sum.2 = fadd float %sum.1, %tmp19 + %sum.3 = fadd float %sum.2, %tmp20 + %sum.4 = fadd float %sum.3, %tmp21 + %sum.5 = fadd float %sum.4, %tmp22 + %sum.6 = fadd float %sum.5, %tmp23 + %sum.7 = fadd float %sum.6, %tmp24 + %sum.8 = fadd float %sum.7, %tmp25 + store float %sum.8, float addrspace(1)* %C, align 4 + ret void +} + +define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { + %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { + %load = load i64, i64 addrspace(3)* %in, align 4 + store i64 %load, i64 addrspace(1)* %out, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll new file mode 100644 index 00000000000..9ea9a5a2617 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; XFAIL: * + +@lds = addrspace(3) global [512 x float] undef, align 4 + +; SI-LABEL: {{^}}offset_order: + +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 + +define void @offset_order(float addrspace(1)* %out) { +entry: + %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 + %val0 = load float, float addrspace(3)* %ptr0 + + %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 + %val1 = load float, float addrspace(3)* %ptr1 + %add1 = fadd float %val0, %val1 + + %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 + %val2 = load float, float addrspace(3)* %ptr2 + %add2 = fadd float %add1, %val2 + + %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 + %val3 = load float, float addrspace(3)* %ptr3 + %add3 = fadd float %add2, 
%val3 + + %ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 + %val4 = load float, float addrspace(3)* %ptr4 + %add4 = fadd float %add3, %val4 + + %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 + %val5 = load float, float addrspace(3)* %ptr5 + %add5 = fadd float %add4, %val5 + + %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 + %val6 = load float, float addrspace(3)* %ptr6 + %add6 = fadd float %add5, %val6 + store float %add6, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll new file mode 100644 index 00000000000..54b3b45636d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -0,0 +1,272 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_read2st64_f32_0_1 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_1_2 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_max_offset +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = 
getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_over_max_offset +; SI-NOT: ds_read2st64_b32 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 +; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16384 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_0 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 63 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_1 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 127 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_0_1 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] 
addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_1_2 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only + +; SI-LABEL: @misaligned_read2st64_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 +; SI: s_endpgm +define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff +; SI-LABEL: @simple_read2st64_f64_max_offset +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, 
align 8 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_over_max_offset +; SI-NOT: ds_read2st64_b64 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8192 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @invalid_read2st64_f64_odd_offset +; SI-NOT: ds_read2st64_b64 +; SI: s_endpgm +define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8129 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; The stride of 8 elements is 8 * 8 bytes. We need to make sure the +; stride in elements, not bytes, is a multiple of 64. 
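+; Working that arithmetic out for this test (informally): a
+; ds_read2st64_b64 offset unit is 64 elements * 8 bytes = 512 bytes,
+; but the two loads below are only 8 elements = 64 bytes apart.
+; 64 bytes divides evenly by 64, yet 8 elements is not a multiple of
+; 64, so no st64 encoding exists and the pair should instead form a
+; plain ds_read2_b64 with offset1:8 (8 elements * 8 bytes = 64 bytes).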
+ +; SI-LABEL: @byte_size_only_divisible_64_read2_f64 +; SI-NOT: ds_read2st64_b64 +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 +; SI: s_endpgm +define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll new file mode 100644 index 00000000000..b553d3459e4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -0,0 +1,425 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_write2_one_val_f32 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32
%x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_0 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_1 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; 2 data subregisters from different super registers. 
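+; (Here %val0.0 is lane 0 of the first <2 x float> load and %val1.1 is
+; lane 1 of the second, so the single ds_write2_b32 below should source
+; its two 32-bit data operands from subregisters of two different
+; 64-bit register pairs.)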
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 +; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 + %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 + %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 + %val0.0 = extractelement <2 x float> %val0, i32 0 + %val1.1 = extractelement <2 x float> %val1, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0.0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1.1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg2_f32 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 + %val0 = extractelement <2 x float> %val, i32 0 + %val1 = extractelement <2 x float> %val, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg4_f32 +; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i + %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 + %val0 = extractelement <4 x float> %val, i32 0 + %val1 = extractelement <4 x float> %val, i32 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_too_far_f32 +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_x2 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: s_endpgm +define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. 
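+  ; (The vector-of-pointers GEP above is presumably opaque to the
+  ; load/store optimizer, while the scalar +8 below is a constant
+  ; offset it can fold into the ds_write encoding.)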
+ %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + store float %val0, float addrspace(3)* %gep.0, align 4 + + %add.x = add nsw i32 %x.i, 8 + store float %val1, float addrspace(3)* %gep.1.offset, align 4 + ret void +} + +; SI-LABEL: @simple_write2_one_val_f64 +; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @misaligned_simple_write2_one_val_f64 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @store_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +define void @store_constant_adjacent_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + ret void +} + +; SI-LABEL: @store_constant_disjoint_offsets +; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} +; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +define void @store_constant_disjoint_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +define void @store_misaligned64_constant_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} +; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: s_endpgm +define void @store_misaligned64_constant_large_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %val = load float, float addrspace(1)* %in + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + store float %val, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + store float %val, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + store float %val, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + store float %val, 
float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + store float %val, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + store float %val, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + store float %val, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + store float %val, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + store float %val, float addrspace(3)* %arrayidx80, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll new file mode 100644 index 00000000000..1d9d881c5c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -0,0 +1,119 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + + +@lds = addrspace(3) global [512 x float] undef, align 4 + + +; SI-LABEL: @simple_write2st64_one_val_f32_0_1 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 +; SI: s_endpgm +define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_f32_2_5 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 +; SI: 
s_endpgm +define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %add.x.0 = add nsw i32 %x.i, 128 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 320 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], +; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 +; SI-NOT: ds_write2st64_b64 +; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 +; SI: s_endpgm +define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll new file mode 100644 index 00000000000..d0fd06a3437 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/elf.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s + +; Test that we don't try to produce a COFF file on windows +; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config +; ELF: Type: SHT_PROGBITS + +; ELF: Symbol { +; ELF: Name: test +; ELF: Binding: Global + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 45096 +; TYPICAL-NEXT: .long 0 +; TONGA-NEXT: .long 576 +; CONFIG: .align 256 +; CONFIG: test: +define void @test(i32 %p) #0 { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/llvm/test/CodeGen/AMDGPU/elf.r600.ll b/llvm/test/CodeGen/AMDGPU/elf.r600.ll new file mode 100644 index 00000000000..51cd0850093 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/elf.r600.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck 
--check-prefix=ELF %s +; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 166100 +; CONFIG-NEXT: .long 2 +; CONFIG-NEXT: .long 165900 +; CONFIG-NEXT: .long 0 +define void @test(float addrspace(1)* %out, i32 %p) { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + store float %r, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/empty-function.ll b/llvm/test/CodeGen/AMDGPU/empty-function.ll new file mode 100644 index 00000000000..a060900811e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/empty-function.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure we don't assert on empty functions + +; SI: .text +; SI-LABEL: {{^}}empty_function_ret: +; SI: s_endpgm +; SI: codeLenInByte = 4 +define void @empty_function_ret() #0 { + ret void +} + +; SI: .text +; SI-LABEL: {{^}}empty_function_unreachable: +; SI: codeLenInByte = 0 +define void @empty_function_unreachable() #0 { + unreachable +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll new file mode 100644 index 00000000000..267a323c506 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; This tests that the llvm.SI.end.cf intrinsic is not inserted into the +; loop block. This intrinsic will be lowered to s_or_b64 by the code +; generator. + +; CHECK-LABEL: {{^}}test: + +; This was lowered from the llvm.SI.end.cf intrinsic: +; CHECK: s_or_b64 exec, exec + +; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} +; CHECK-NOT: s_or_b64 exec, exec +; CHECK: s_cbranch_execnz [[LOOP_LABEL]] +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %loop + +if: + store i32 0, i32 addrspace(1)* %out + br label %loop + +loop: + %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] + %inc = add i32 %tmp1, %cond + %tmp2 = icmp ugt i32 %inc, 10 + br i1 %tmp2, label %done, label %loop + +done: + %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 %inc, i32 addrspace(1)* %tmp3 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll new file mode 100644 index 00000000000..294c3a9c678 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}load_i8_sext_private: +; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = sext i8 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_zext_private: +; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 =
alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = zext i8 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_private: +; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = sext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_zext_private: +; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = zext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/extload.ll b/llvm/test/CodeGen/AMDGPU/extload.ll new file mode 100644 index 00000000000..662eb7a9716 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/extload.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}anyext_load_i8: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* + store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_i16: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* + store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i8: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* + store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i16: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* + store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 + ret void +} diff 
--git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll new file mode 100644 index 00000000000..c7572efc6f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { + %p0 = extractelement <2 x i16> %foo, i32 0 + %p1 = extractelement <2 x i16> %foo, i32 1 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} + +; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { + %p0 = extractelement <4 x i16> %foo, i32 0 + %p1 = extractelement <4 x i16> %foo, i32 2 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll new file mode 100644 index 00000000000..3c6136c1a7b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -0,0 +1,97 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +declare double @fabs(double) readnone +declare double @llvm.fabs.f64(double) readnone +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone + +; FUNC-LABEL: {{^}}v_fabs_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tidext = sext i32 %tid to i64 + %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext + %val = load double, double addrspace(1)* %gep, align 8 + %fabs = call double @llvm.fabs.f64(double %val) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f64: +; SI: v_and_b32 +; SI-NOT: v_and_b32 +; SI: s_endpgm +define void @fabs_f64(double addrspace(1)* %out, double %in) { + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { + %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) + store <2 x double> %fabs, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { + %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) + store <4 x double> %fabs, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: s_endpgm +define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @llvm.fabs.f64(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fn_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: s_endpgm +define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @fabs(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @llvm.fabs.f64(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_fn_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @fabs(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll new file mode 100644 index 00000000000..419a73d0266 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; DAGCombiner will transform: +; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true + +; FUNC-LABEL: {{^}}fabs_fn_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @fabs(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +define void @fabs_f32(float addrspace(1)* %out, float %in) { + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +define void 
@fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + store <4 x float> %fabs, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fn_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @fabs(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @llvm.fabs.f32(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +declare float @fabs(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/AMDGPU/fadd.ll b/llvm/test/CodeGen/AMDGPU/fadd.ll new file mode 100644 index 00000000000..5fac328c598 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fadd.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fadd_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_add_f32 +define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { + %add = fadd float %a, %b + store float %add, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %add = fadd <2 x float> %a, %b + store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v8f32: +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 
+; SI: v_add_f32 +define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { + %add = fadd <8 x float> %a, %b + store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fadd64.ll b/llvm/test/CodeGen/AMDGPU/fadd64.ll new file mode 100644 index 00000000000..485c55870c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fadd64.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fadd_f64: +; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} + +define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fceil.ll b/llvm/test/CodeGen/AMDGPU/fceil.ll new file mode 100644 index 00000000000..f23e8919d73 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fceil.ll @@ -0,0 +1,132 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.ceil.f32(float) nounwind readnone +declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f32: +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.ceil.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v3f32: +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { + %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone + store <3 x float> %y, <3 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v4f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? 
*}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] +define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll new file mode 100644 index 00000000000..e8c34f0141e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.ceil.f64(double) nounwind readnone +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f64: +; CI: v_ceil_f64_e32 +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI-DAG: v_cmp_lt_f64 +; SI-DAG: v_cmp_lg_f64 +; SI: s_and_b64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: v_add_f64 +; SI: s_endpgm +define void @fceil_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.ceil.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}fceil_v4f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: 
v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll new file mode 100644 index 00000000000..530274f920f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -0,0 +1,14 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Not checking arguments 2 and 3 to CNDE, because they may change between +;registers and literal.x depending on what the optimizer does. +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 2, i32 3 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll new file mode 100644 index 00000000000..c402805feb3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the +; chance to optimize the fcmp + select instructions to SET* was missed +; due to the fact that the operands to fcmp and select had different types + +; CHECK: SET{{[A-Z]+}}_DX10 + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 -1, i32 0 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.ll b/llvm/test/CodeGen/AMDGPU/fcmp.ll new file mode 100644 index 00000000000..5207ab57bad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcmp.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}fcmp_sext: +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 + %1 = load float, float addrspace(1)* %arrayidx1 + %cmp = fcmp oeq float %0, %1 + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; This test checks that a setcc node with f32 operands is lowered to a +; SET*_DX10 instruction. 
Previously we were lowering this to: +; SET* + FP_TO_SINT + +; CHECK: {{^}}fcmp_br: +; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} +; CHECK-NEXT: {{[0-9]+\(5.0}} + +define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcmp64.ll b/llvm/test/CodeGen/AMDGPU/fcmp64.ll new file mode 100644 index 00000000000..053ab0ed7aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcmp64.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}flt_f64: +; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ult double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fle_f64: +; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ule double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fgt_f64: +; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ugt double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fge_f64: +; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp uge double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fne_f64: +; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp une double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}feq_f64: +; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ueq double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fconst64.ll b/llvm/test/CodeGen/AMDGPU/fconst64.ll new file mode 100644 index 00000000000..89af37545c9 --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/fconst64.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fconst_f64: +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 + +define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r1 = load double, double addrspace(1)* %in + %r2 = fadd double %r1, 5.000000e+00 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll new file mode 100644 index 00000000000..b719d5a3978 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.copysign.f32(float, float) nounwind readnone +declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone + +; Try to identify arg based on higher address. +; FUNC-LABEL: {{^}}test_copysign_f32: +; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb +; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc +; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c +; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BFI_INT +define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { + %result = call float @llvm.copysign.f32(float %mag, float %sign) + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { + %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { + %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll new file mode 100644 index 00000000000..3d8c5599308 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +declare double 
@llvm.copysign.f64(double, double) nounwind readnone +declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}test_copysign_f64: +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] +; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] +; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} +; GCN: s_endpgm +define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { + %result = call double @llvm.copysign.f64(double %mag, double %sign) + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f64: +; GCN: s_endpgm +define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { + %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) + store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f64: +; GCN: s_endpgm +define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { + %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll new file mode 100644 index 00000000000..7c022e38c80 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -0,0 +1,96 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s + + +; COMMON-LABEL: {{^}}fdiv_f64: +; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 +; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] + +; Check for div_scale bug workaround on SI +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] + +; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] + +; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_xor_b64 vcc, [[CMP0]], 
vcc + +; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] +; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] +; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] +; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] +; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] +; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] +; COMMON: buffer_store_dwordx2 [[RESULT]] +; COMMON: s_endpgm +define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 + %num = load double, double addrspace(1)* %in + %den = load double, double addrspace(1)* %gep.1 + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_v: +define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { + %den = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_v_s: +define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { + %num = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_s: +define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v2f64: +define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 + %num = load <2 x double>, <2 x double> addrspace(1)* %in + %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v2f64: +define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v4f64: +define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %num = load <4 x double>, <4 x double> addrspace(1)* %in + %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v4f64: +define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll new file mode 100644 index 00000000000..7cbf8733639 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=r600 
-mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; These tests check that fdiv is expanded correctly and also test that the +; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate +; instruction groups. + +; FUNC-LABEL: {{^}}fdiv_f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: {{^}}fdiv_v2f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fdiv <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_v4f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fdiv <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/llvm/test/CodeGen/AMDGPU/fetch-limits.r600.ll new file mode 100644 index 00000000000..e7160ef5d72 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fetch-limits.r600.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s + +; R600 supports 8 fetches in a clause +; CHECK: {{^}}fetch_limits_r600: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r600() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x 
float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + + %bcde = fadd <4 x float> %bc, %de + + call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/llvm/test/CodeGen/AMDGPU/fetch-limits.r700+.ll new file mode 100644 index 00000000000..acaea2aa794 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fetch-limits.r700+.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; r700+ supports 16 fetches in a clause +; CHECK: {{^}}fetch_limits_r700: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r700() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, 
i32 2) + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) + %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> 
%res6, %res7 + %e = fadd <4 x float> %res8, %res9 + %f = fadd <4 x float> %res10, %res11 + %g = fadd <4 x float> %res12, %res13 + %h = fadd <4 x float> %res14, %res15 + %i = fadd <4 x float> %res16, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + %fg = fadd <4 x float> %f, %g + %hi = fadd <4 x float> %h, %i + + %bcde = fadd <4 x float> %bc, %de + %fghi = fadd <4 x float> %fg, %hi + + %bcdefghi = fadd <4 x float> %bcde, %fghi + call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll new file mode 100644 index 00000000000..45f8382c392 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.floor.f64(double) nounwind readnone +declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}ffloor_f64: +; CI: v_floor_f64_e32 +; SI: v_fract_f64_e32 +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 +; SI: s_endpgm +define void @ffloor_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.floor.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] +; SI: s_endpgm +define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { + %neg = fsub double 0.0, %x + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| +; SI: s_endpgm +define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { + %abs = call double @llvm.fabs.f64(double %x) + %neg = fsub double 0.0, %abs + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v2f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) 
nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: +; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ffloor_v4f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v8f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v16f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.ll b/llvm/test/CodeGen/AMDGPU/ffloor.ll new file mode 100644 index 00000000000..61c46ac2bc0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ffloor.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}floor_f32: +; SI: v_floor_f32_e32 +; R600: FLOOR +define void @floor_f32(float addrspace(1)* %out, float %in) { + %tmp = call float @llvm.floor.f32(float %in) #0 + store float %tmp, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v2f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 + store <2 x float> %tmp, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v4f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 + store <4 x float> %tmp, <4 x float> addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.floor.f32(float) #0 + +; Function Attrs: nounwind readonly +declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 + +; 
Function Attrs: nounwind readonly +declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll new file mode 100644 index 00000000000..8ceca078f2d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -0,0 +1,184 @@ +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; These testcases might become useless when there are optimizations to +; remove generic pointers. 
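+ +; Illustration only (a hedged sketch, not one of the tests in this patch; +; the function name @fold_flat_to_global is hypothetical): the kind of +; specialization the -O0 RUN lines above guard against would trace a flat +; pointer back to its source and rewrite the access into the known address +; space: +; +; define void @fold_flat_to_global(i32 addrspace(1)* %gptr, i32 %x) { +;   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* +;   store i32 %x, i32 addrspace(4)* %fptr, align 4 +;   ; ...an optimizer could emit instead: store i32 %x, i32 addrspace(1)* %gptr +;   ret void +; }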
+
+; CHECK-LABEL: {{^}}store_flat_i32:
+; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i64:
+; CHECK: flat_store_dwordx2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_v4i32:
+; CHECK: flat_store_dwordx4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i16:
+; CHECK: flat_store_short
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %y = trunc i32 %x to i16
+  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i8:
+; CHECK: flat_store_byte
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %y = trunc i32 %x to i8
+  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+
+
+; CHECK-LABEL: @load_flat_i32:
+; CHECK: flat_load_dword
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %fload = load i32, i32 addrspace(4)* %fptr, align 4
+  store i32 %fload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_i64:
+; CHECK: flat_load_dwordx2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  %fload = load i64, i64 addrspace(4)* %fptr, align 4
+  store i64 %fload, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_v4i32:
+; CHECK: flat_load_dwordx4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
+  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i8:
+; CHECK: flat_load_sbyte
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8, i8 addrspace(4)* %fptr, align 4
+  %ext = sext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i8:
+; CHECK: flat_load_ubyte
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8, i8 addrspace(4)* %fptr, align 4
+  %ext = zext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
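+; The extload tests above and below pair sign extension with the signed
+; flat loads (flat_load_sbyte, flat_load_sshort) and zero extension with
+; the unsigned ones (flat_load_ubyte, flat_load_ushort). This is an
+; explanatory note only, not checked output.
+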
+; CHECK-LABEL: @sextload_flat_i16:
+; CHECK: flat_load_sshort
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16, i16 addrspace(4)* %fptr, align 4
+  %ext = sext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i16:
+; CHECK: flat_load_ushort
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16, i16 addrspace(4)* %fptr, align 4
+  %ext = zext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: {{^}}store_flat_scratch:
+; CHECK: s_movk_i32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
+; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
+; CHECK: flat_store_dword
+; CHECK: s_barrier
+; CHECK: flat_load_dword
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %x = call i32 @llvm.r600.read.tidig.x() #3
+  %pptr = getelementptr i32, i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32, i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/floor.ll b/llvm/test/CodeGen/AMDGPU/floor.ll
new file mode 100644
index 00000000000..c6bfb8567a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/floor.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
+
+; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @test(<4 x float> inreg %reg0) #0 {
+  %r0 = extractelement <4 x float> %reg0, i32 0
+  %r1 = call float @floor(float %r0)
+  %vec = insertelement <4 x float> undef, float %r1, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+  ret void
+}
+
+declare float @floor(float) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
new file mode 100644
index 00000000000..bd574b87711
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -0,0 +1,368 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare double @llvm.fabs.f64(double) #0
+declare double @llvm.fma.f64(double, double, double) #0
+declare float @llvm.fma.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: 
buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fadd double %mul, %c + %fma1 = fadd double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_fma_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], 
[[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, 
i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fsub double %mul, %c + %fma1 = fsub double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double 
addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %fma0 = fsub double %c, %mul
+  %fma1 = fsub double %d, %mul
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma = fsub double %mul.neg, %c
+
+  store double %fma, double addrspace(1)* %gep.out
+  ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul.neg, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
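+; In the test above and the one below, the negated product is written with
+; the canonical fneg idiom of this IR (a clarifying note, not checked
+; output):
+;   %mul.neg = fsub double -0.0, %mul
+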
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
+; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
+; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI: s_endpgm
+define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0
+  %b = load double, double addrspace(1)* %gep.1
+  %c = load double, double addrspace(1)* %gep.2
+  %d = load double, double addrspace(1)* %gep.3
+
+  %mul = fmul double %a, %b
+  %mul.neg = fsub double -0.0, %mul
+  %fma0 = fsub double %mul.neg, %c
+  %fma1 = fsub double %mul, %d
+
+  store double %fma0, double addrspace(1)* %gep.out.0
+  store double %fma1, double addrspace(1)* %gep.out.1
+  ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
+; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
+; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
+; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
+  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
+  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
+  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
+
+  %x = load double, double addrspace(1)* %gep.0
+  %y = load double, double addrspace(1)* %gep.1
+  %z = load 
double, double addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 + %tmp2 = fsub double %tmp1, %z + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %x = load double, double addrspace(1)* %gep.0 + %y = load double, double addrspace(1)* %gep.1 + %z = load double, double addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub double %x, %tmp1 + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll new file mode 100644 index 00000000000..0a55ef77855 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.fma.f64(double, double, double) nounwind readnone +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + + +; FUNC-LABEL: {{^}}fma_f64: +; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double 
@llvm.fma.f64(double %r0, double %r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3 + %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) + store <2 x double> %r3, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3 + %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) + store <4 x double> %r3, <4 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll new file mode 100644 index 00000000000..d6024aa0b4c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fma.f32(float, float, float) nounwind readnone +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}fma_f32: +; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG: FMA {{\*? *}}[[RES]] +define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f32: +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] +; EG-DAG: FMA {{\*? 
*}}[[RES]].[[CHHI]] +define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 + %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3 + %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) + store <2 x float> %r3, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f32: +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].X +; EG-DAG: FMA {{\*? *}}[[RES]].Y +; EG-DAG: FMA {{\*? *}}[[RES]].Z +; EG-DAG: FMA {{\*? *}}[[RES]].W +define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 + %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3 + %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) + store <4 x float> %r3, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 +; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}} +define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} + +; FUNC-LABEL: @fma_commute_mul_s_f32 +define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %c = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float %b, float %c) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fmad.ll b/llvm/test/CodeGen/AMDGPU/fmad.ll new file mode 100644 index 00000000000..935e35123f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmad.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = extractelement <4 x float> %reg0, i32 2 + %r3 = fmul float %r0, %r1 + %r4 = fadd float %r3, %r2 + %vec = insertelement <4 x float> undef, float %r4, i32 0 + 
call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @fabs(float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/fmax.ll b/llvm/test/CodeGen/AMDGPU/fmax.ll new file mode 100644 index 00000000000..d7127f485c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmax.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp oge float %r0, %r1 + %r3 = select i1 %r2, float %r0, float %r1 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll new file mode 100644 index 00000000000..f78c71b2826 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.maxnum.f64(double, double) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_f64: +; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}} +; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 +; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 +; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 + %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load double, double addrspace(1)* %bptr, align 8 + %c = load double, double addrspace(1)* %cptr, align 8 + %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone + %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone + store double %f1, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll new file mode 100644 index 00000000000..c3028a6217d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.maxnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_olt_0: +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { 
+ %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmax +; SI-LABEL: {{^}}test_fmax3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll new file mode 100644 index 00000000000..828243888ac --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; Make sure we don't try to form FMAX_LEGACY nodes with f64 + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f64 +define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp uge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f64 +define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp oge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f64 +define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ugt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f64 +define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* 
%in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ogt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll new file mode 100644 index 00000000000..413957d2982 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -0,0 +1,116 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; EG: MAX +define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp oge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 
@llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ugt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ogt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; SI-NOT: v_max_
+; SI: v_cmp_gt_f32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_max_
+
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ogt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out0, align 4
+  store i1 %cmp, i1 addrspace(1)* %out1
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
new file mode 100644
index 00000000000..de563cec341
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -0,0 +1,76 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.maxnum.f64(double, double) #0
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: @test_fmax_f64
+; SI: v_max_f64
+define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %val = call double @llvm.maxnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v2f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) 
nounwind { + %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmax_v8f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmax_v16f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll new file mode 100644 index 00000000000..3029bd02e4d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll @@ -0,0 +1,283 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.maxnum.f32(float, float) #0 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 + +declare double @llvm.maxnum.f64(double, double) + +; FUNC-LABEL: @test_fmax_f32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_v2f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v4f32(<4 x float> 
addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v8f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
+; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
+define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
+define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_val_nan
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_nan_val
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
+; SI-NOT: v_max_f32_e32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
+define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_var_immediate_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG-NOT: MAX_DX10
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_immediate_var_f32
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_var_literal_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fmax_literal_var_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
+; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
+define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/fmin.ll b/llvm/test/CodeGen/AMDGPU/fmin.ll
new file mode 100644
index 00000000000..defa8c09638
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmin.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> inreg %reg0) #0 {
+  %r0 = extractelement <4 x float> %reg0, i32 0
+  %r1 = extractelement <4 x float> %reg0, i32 1
+  %r2 = fcmp uge float %r0, %r1
+  %r3 = select i1 %r2, float %r1, float %r0
+  %vec = insertelement <4 x float> undef, float %r3, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
new file mode 100644
index 00000000000..0a76699b43e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmin3_olt_0:
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %a = load float, float addrspace(1)* %aptr, align 4
+  %b = load float, float addrspace(1)* %bptr, align 4
+  %c = load float, float addrspace(1)* %cptr, align 4
+  %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+  %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+  store float %f1, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Commute operand of second fmin
+; SI-LABEL:
{{^}}test_fmin3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll new file mode 100644 index 00000000000..e19a48f3f7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f64 +define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { + %r0 = extractelement <4 x double> %reg0, i32 0 + %r1 = extractelement <4 x double> %reg0, i32 1 + %r2 = fcmp uge double %r0, %r1 + %r3 = select i1 %r2, double %r1, double %r0 + %vec = insertelement <4 x double> undef, double %r3, i32 0 + store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f64 +define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ule double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f64 +define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ole double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f64 +define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp olt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f64 +define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 
@llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ult double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll new file mode 100644 index 00000000000..6a625c239d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -0,0 +1,123 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f32 +; EG: MIN * +; SI-SAFE: v_min_legacy_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ule float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f32 
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp olt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ult float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-NOT: v_min +; SI: v_cmp_le_f32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val0 = select i1 %cmp, float %a, float %b + store float %val0, float addrspace(1)* %out0, align 4 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll new file mode 100644 index 00000000000..0f929d6a81f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.minnum.f64(double, double) #0 +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 +declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 +declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 + +; FUNC-LABEL: 
@test_fmin_f64 +; SI: v_min_f64 +define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { + %val = call double @llvm.minnum.f64(double %a, double %b) #0 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll new file mode 100644 index 00000000000..4d7b52540d8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll @@ -0,0 +1,281 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.minnum.f32(float, float) #0 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 + +; FUNC-LABEL: @test_fmin_f32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.minnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> 
@llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W +define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan +; SI-NOT: v_min_f32_e32 
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) +define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_val_nan +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_val +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_immediate_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS 
STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_immediate_var_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_literal_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_literal_var_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmul.ll b/llvm/test/CodeGen/AMDGPU/fmul.ll new file mode 100644 index 00000000000..addc409c9eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmul.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}fmul_f32: +; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W + +; SI: v_mul_f32 +define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fmul float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fmul_v2f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fmul <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x 
float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fmul <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI: s_endpgm +define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 2.0 + %z = fmul float %y, 3.0 + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k_inv: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI-NOT: v_mad_f32 +; SI: s_endpgm +define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 3.0 + %z = fmul float %y, 2.0 + store float %z, float addrspace(1)* %out + ret void +} + +; There should be three multiplies here; %a should be used twice (once +; negated), not duplicated into mul x, 5.0 and mul x, -5.0. +; FUNC-LABEL: {{^}}test_mul_twouse: +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { + %a = fmul float %x, 5.0 + %b = fsub float -0.0, %a + %c = fmul float %b, %y + %d = fmul float %c, %a + store float %d, float addrspace(1)* %out + ret void +} + +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmul64.ll b/llvm/test/CodeGen/AMDGPU/fmul64.ll new file mode 100644 index 00000000000..3c222eaba89 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmul64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +; FUNC-LABEL: {{^}}fmul_f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fmul double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v2f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = fmul <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = fmul <4 x double> %r0, %r1 + store <4 x double> %r2, <4 x double> addrspace(1)* %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/fmuladd.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.ll new file mode 100644 index 00000000000..ae84d841021 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.ll @@ -0,0 +1,199 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.fmuladd.f32(float, float, float) +declare double @llvm.fmuladd.f64(double, double, double) +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; CHECK-LABEL: {{^}}fmuladd_f32: +; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_f64: +; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fadd_a_a_b_f32: +; CHECK-DAG: buffer_load_dword 
[[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fadd_a_a_b_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %add.0, %r1 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fadd_b_a_a_f32: +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fadd_b_a_a_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %r1, %add.0 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 
%tid
+
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float, float addrspace(1)* %gep.0
+  %r2 = load float, float addrspace(1)* %gep.1
+
+  %r2.fneg = fsub float -0.000000e+00, %r2
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
new file mode 100644
index 00000000000..4fa9adaabda
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
+
+; This should have exactly the same output as the test for rint,
+; so no need to check anything.
+
+declare float @llvm.nearbyint.f32(float) #0
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0
+declare double @llvm.nearbyint.f64(double) #0
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
+declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
+
+
+define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+entry:
+  %0 = call float @llvm.nearbyint.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+entry:
+  %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+entry:
+  %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+entry:
+  %0 = call double @llvm.nearbyint.f64(double %in)
+  store double %0, double addrspace(1)* %out
+  ret void
+}
+define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+entry:
+  %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
+  store <2 x double> %0, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+entry:
+  %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
+  store <4 x double> %0, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
new file mode 100644
index 00000000000..8830e827366
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: Check something here. Currently it seems fabs + fneg aren't
+; folded into 2 modifiers, although theoretically that should work.
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fadd = fadd double %y, %fsub
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+  %x = load double, double addrspace(1)* %xptr, align 8
+  %y = load double, double addrspace(1)* %xptr, align 8
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fadd = fadd double %y, %fsub
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fmul = fmul double %y, %fsub
+  store double %fmul, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @llvm.fabs.f64(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @fabs(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+  %fabs = call double @llvm.fabs.f64(double %in)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+  %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <4 x double> %fsub, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
new file mode 100644
index 00000000000..3b4930d9897
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -0,0 +1,118 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
+; SI-NOT: and
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
+define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fadd = fadd float %y, %fsub
+  store float %fadd, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
+; SI-NOT: and
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
+; SI-NOT: and
+define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fmul = fmul float %y, %fsub
+  store float %fmul, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFabsFree returns true
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @llvm.fabs.f32(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @fabs(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+  %fabs = call float @llvm.fabs.f32(float %in)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %val = load float, float addrspace(1)* %in, align 4
+  %fabs = call float @llvm.fabs.f32(float %val)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+
+; FIXME: SGPR should be used directly for first src operand.
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <2 x float> %fsub, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: SGPR should be used directly for first src operand.
+; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <4 x float> %fsub, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
new file mode 100644
index 00000000000..aa6df209035
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f64:
+; GCN: v_xor_b32
+define void @fneg_f64(double addrspace(1)* %out, double %in) {
+  %fneg = fsub double -0.000000e+00, %in
+  store double %fneg, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f64:
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+  %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
+  store <2 x double> %fneg, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f64:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+  %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
+  store <4 x double> %fneg, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000))
+; unless the target returns true for isNegFree()
+
+; FUNC-LABEL: {{^}}fneg_free_f64:
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}}
+define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fsub = fsub double 0.0, %bc
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fold_f64:
+; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-NOT: xor
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+  %fsub = fsub double -0.0, %in
+  %fmul = fmul double %fsub, %in
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
new file mode 100644
index 00000000000..a0fd539863c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f32:
+; R600: -PV
+
+; GCN: v_xor_b32
+define void @fneg_f32(float addrspace(1)* %out, float %in) {
+  %fneg = fsub float -0.000000e+00, %in
+  store float %fneg, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f32:
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
+  store <2 x float> %fneg, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f32:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+; GCN: v_xor_b32
+define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
+  store <4 x float> %fneg, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
+; unless the target returns true for isNegFree()
+
+; FUNC-LABEL: {{^}}fneg_free_f32:
+; R600-NOT: XOR
+; R600: -KC0[2].Z
+
+; XXX: We could use v_add_f32_e64 with the negate bit here instead.
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} +define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fsub = fsub float 0.0, %bc + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fold_f32: +; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: xor +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] +define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { + %fsub = fsub float -0.0, %in + %fmul = fmul float %fsub, %in + store float %fmul, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll new file mode 100644 index 00000000000..4fac5176fac --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -0,0 +1,131 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_isinf_pattern: +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_0: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_1: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_isfinite_pattern_0: +; SI-NOT: v_cmp +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Use negative infinity +; SI-LABEL: {{^}}test_isfinite_not_pattern_0: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret 
void +} + +; No fabs +; SI-LABEL: {{^}}test_isfinite_not_pattern_1: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %ninf = fcmp une float %x, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; fabs of different value +; SI-LABEL: {{^}}test_isfinite_not_pattern_2: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong ordered compare type +; SI-LABEL: {{^}}test_isfinite_not_pattern_3: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp uno float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong unordered compare +; SI-LABEL: {{^}}test_isfinite_not_pattern_4: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp.ll new file mode 100644 index 00000000000..5a79ca82bc2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp16_to_fp32: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]] +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; SI-LABEL: {{^}}test_convert_fp16_to_fp64: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] +; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll new file mode 100644 index 00000000000..67925ebd82b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp32_to_fp16: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_short [[RESULT]] +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll new file mode 100644 index 00000000000..12df6606e8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -0,0 +1,56 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @fp_to_sint_f64_i32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { + %result = fptosi double %in to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %result = fptosi <2 x double> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %result = fptosi <4 x double> %in to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_i64_f64 +; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptosi double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll 
new file mode 100644 index 00000000000..301a94b4904 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -0,0 +1,230 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}fp_to_sint_i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: s_endpgm +define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { + %conv = fptosi float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: +; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { + %in.fabs = call float @llvm.fabs.f32(float %in) #0 + %conv = fptosi float %in.fabs to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v2i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptosi <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v4i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptosi <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i64: + +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; Check that the compiler doesn't crash with a "cannot select" error +; SI: s_endpgm +define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +entry: + %0 = fptosi float %in to i64 + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: 
XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptosi <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptosi <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll new file mode 100644 index 00000000000..41bc2a78001 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}fp_to_uint_i32_f64: +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { + %cast = fptoui double %in to i32 + store i32 %cast, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i32_v2f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i32> + store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i32_v4f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i32> + store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: 
@fp_to_uint_i64_f64 +; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptoui double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i64_v2f64 +define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i64> + store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i64_v4f64 +define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i64> + store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll new file mode 100644 index 00000000000..b7b6ccc238b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -0,0 +1,217 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { + %conv = fptoui float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptoui <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 + +define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptoui <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret 
void +} + +; FUNC: {{^}}fp_to_uint_f32_to_i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { + %conv = fptoui float %x to i64 + store i64 %conv, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptoui <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptoui <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fpext.ll b/llvm/test/CodeGen/AMDGPU/fpext.ll new file mode 
100644 index 00000000000..734a43be229 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fpext.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fpext_f32_to_f64: +; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { + %result = fpext float %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { + %result = fpext <2 x float> %in to <2 x double> + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { + %result = fpext <4 x float> %in to <4 x double> + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { + %result = fpext <8 x float> %in to <8 x double> + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll new file mode 100644 index 00000000000..385e10e7baa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: +; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { + %result = fptrunc double %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { + %result = fptrunc <2 x double> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { + %result = fptrunc <4 x double> %in to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { + %result = fptrunc <8 x double> %in to <8 x float> + store <8 x float> %result, <8 x float> 
addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll new file mode 100644 index 00000000000..f245ef08cb9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}frem_f32: +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN-DAG: v_cmp +; GCN-DAG: v_mul_f32 +; GCN: v_rcp_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_trunc_f32_e32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f32: +; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} +; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] +; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] +; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] +; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: s_endpgm +define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #1 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}frem_f64: +; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN-DAG: v_div_fmas_f64 +; GCN-DAG: v_div_scale_f64 +; GCN-DAG: v_mul_f64 +; CI: v_trunc_f64_e32 +; CI: v_mul_f64 +; GCN: v_add_f64 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm +define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f64: +; GCN: v_rcp_f64_e32 +; GCN: v_mul_f64 +; SI: v_bfe_u32 +; CI: v_trunc_f64_e32 +; GCN: v_fma_f64 +; GCN: s_endpgm +define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #1 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 + %r1 = load <2 x float>, <2 x 
float> addrspace(1)* %gep2, align 8 + %r2 = frem <2 x float> %r0, %r1 + store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 + %r2 = frem <4 x float> %r0, %r1 + store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 + %r2 = frem <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll new file mode 100644 index 00000000000..04101346cdf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s + +; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) + +; CHECK: {{^}}fsqrt_f32: +; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} + +define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %r0 = load float, float addrspace(1)* %in + %r1 = call float @llvm.sqrt.f32(float %r0) + store float %r1, float addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fsqrt_f64: +; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r0 = load double, double addrspace(1)* %in + %r1 = call double @llvm.sqrt.f64(double %r0) + store double %r1, double addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %Val) +declare double @llvm.sqrt.f64(double %Val) diff --git a/llvm/test/CodeGen/AMDGPU/fsub.ll b/llvm/test/CodeGen/AMDGPU/fsub.ll new file mode 100644 index 00000000000..dfe41cb5b11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fsub.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}v_fsub_f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}s_fsub_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W + +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { + %sub = fsub float %a, %b + store float %sub, float addrspace(1)* %out, align 4 + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fsub_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y + +; FIXME: Should be using SGPR directly for first operand +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %sub = fsub <2 x float> %a, %b + store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_fsub_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} + +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: Should be using SGPR directly for first operand + +; FUNC-LABEL: {{^}}s_fsub_v4f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: s_endpgm +define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fsub64.ll b/llvm/test/CodeGen/AMDGPU/fsub64.ll new file mode 100644 index 00000000000..f34a48e30a8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fsub64.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.fabs.f64(double) #0 + +; SI-LABEL: {{^}}fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fsub double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +define void 
@fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 + %r2 = fsub double %r0, %r1.fabs + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} +define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 + %r2 = fsub double %r0.fabs, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, %b + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double 4.0, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, 4.0 + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_self_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { + %sub = fsub double %a, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v2f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { + %sub = fsub <2 x double> %a, %b + store <2 x double> %sub, <2 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %a = load <4 x double>, <4 x double> addrspace(1)* %in + %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr + %result = fsub <4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { + %result = fsub 
<4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll new file mode 100644 index 00000000000..6618d8b5e57 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.trunc.f64(double) nounwind readnone +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}v_ftrunc_f64: +; CI: v_trunc_f64 +; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 +; SI: s_endpgm +define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %x = load double, double addrspace(1)* %in, align 8 + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_f64: +; CI: v_trunc_f64_e32 + +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: s_endpgm +define void @ftrunc_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}ftrunc_v16f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.ll new file mode 100644 index 00000000000..edc08609a8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ftrunc.ll @@ -0,0 +1,120 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s + +declare float @llvm.trunc.f32(float) nounwind readnone +declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}ftrunc_f32: +; EG: TRUNC +; SI: v_trunc_f32_e32 +define void @ftrunc_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.trunc.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f32: +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone +; store <3 x float> %y, <3 x float> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v16f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: 
TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll new file mode 100644 index 00000000000..471b0f6b13e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s + +define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space: +; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} +; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer. 
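+; For reference, the offset arithmetic behind the comment above (a sketch of
+; the calculation, not checked by FileCheck): element index 16384 * 4 bytes
+; per i32 = 65536 bytes from the base pointer.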
+; SI: s_or_b32 +; CI: s_add_i32 +; CHECK: ds_write_b32 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v4: +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16> + %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 + %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 + %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + store i32 99, i32 addrspace(3)* %p2 + store i32 99, i32 addrspace(3)* %p3 + ret void +} + +define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v2: +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16> + %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/global-directive.ll b/llvm/test/CodeGen/AMDGPU/global-directive.ll new file mode 100644 index 00000000000..be775cf9292 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-directive.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure the GlobalDirective isn't merged with the function name + +; SI: .globl foo +; SI: {{^}}foo: +define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i1.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i1.ll new file mode 100644 index 00000000000..bd9557d730f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i1.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: Evergreen broken + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL:
{{^}}zextload_global_v1i1_to_v1i32: +; SI: s_endpgm +define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: 
{{^}}zextload_global_v32i1_to_v32i32: +; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} +; SI: buffer_store_dwordx2 +define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + 
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i64(<64 x 
i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll new file mode 100644 index 00000000000..103a40dee27 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: cypress is broken because the bigger testcases spill and it's not implemented + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: +; SI: buffer_load_ushort +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: +; SI: buffer_load_sshort +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: +; SI: buffer_load_ushort +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: +; SI: buffer_load_sshort +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i32(<4 x i32> 
addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: +; SI: buffer_load_ushort v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i64 + store i64 %ext, i64 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i64: +; SI: buffer_load_sshort [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* 
%in + %ext = zext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll new file mode 100644 index 00000000000..79b83452939 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -0,0 +1,457 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: +; SI: buffer_load_dword v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = zext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = sext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> 
addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i32>, <1 x i32> addrspace(1)* %in + %ext = zext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: v_ashrrev_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i32>, <1 x i32> addrspace(1)* %in + %ext = sext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = zext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = sext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = zext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = sext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = zext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}sextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = sext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = sext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = 
zext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = sext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; 
SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = zext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i8.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i8.ll new file mode 100644 index 00000000000..b31d5361d5a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i8.ll @@ -0,0 +1,299 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: +; SI: buffer_load_sbyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + 
%load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> 
addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext <32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: +; SI: buffer_load_ubyte v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}sextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/global-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/global-zero-initializer.ll new file mode 100644 index 00000000000..45aa8bf4e1d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc 
-march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_init_global_global + +@lds = addrspace(1) global [256 x i32] zeroinitializer + +define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(1)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll new file mode 100644 index 00000000000..847950f6376 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -0,0 +1,801 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_i32_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_add_i32_addr64(i32 
addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* 
%ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 
addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw 
volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_offset: +; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32: +; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_offset: 
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32: +; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_or_i32_offset(i32 addrspace(1)* 
%out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: 
{{^}}atomic_xchg_i32_ret_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: 
buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll new file mode 100644 index 00000000000..014b0a5482a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 + +; FUNC-LABEL: {{^}}test_i8: +; EG: CF_END +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_i8( i32 %s, i8 
addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s + %1 = load i8, i8 addrspace(2)* %arrayidx, align 1 + store i8 %1, i8 addrspace(1)* %out + ret void +} + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +; FUNC-LABEL: {{^}}test_i16: +; EG: CF_END +; SI: buffer_store_short +; SI: s_endpgm +define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s + %1 = load i16, i16 addrspace(2)* %arrayidx, align 2 + store i16 %1, i16 addrspace(1)* %out + ret void +} + +%struct.bar = type { float, [5 x i8] } + +; The illegal i8s aren't handled +@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ] + +; FUNC-LABEL: {{^}}struct_bar_gv_load: +define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i8, i8 addrspace(2)* %gep, align 1 + store i8 %load, i8 addrspace(1)* %out, align 1 + ret void +} + + +; The private load isn't scalarized. +@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>, + <4 x i32> <i32 5, i32 6, i32 7, i32 8>, + <4 x i32> <i32 9, i32 10, i32 11, i32 12>, + <4 x i32> <i32 13, i32 14, i32 15, i32 16> ] + +; FUNC-LABEL: {{^}}array_vector_gv_load: +define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index + %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16 + store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll new file mode 100644 index 00000000000..3c1fc6c98f7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 + +; FUNC-LABEL: {{^}}float: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @float(float addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %1 = load float, float addrspace(2)* %0 + store float %1, float addrspace(1)* %out + ret void +} + +@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 + +; FUNC-LABEL: {{^}}i32: + +; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @i32(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +%struct.foo = type { float, [5 x i32] } + +@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] + +; FUNC-LABEL: {{^}}struct_foo_gv_load: +; GCN: s_load_dword + +define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i32, i32 addrspace(2)* %gep, align 4 + store i32 %load, i32 addrspace(1)* %out, align 4 + ret void +} + +@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>, + <1 x i32> <i32 2>, + <1 x i32> <i32 3>, + <1 x i32> <i32 4> ] + +; FUNC-LABEL: {{^}}array_v1_gv_load: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword +define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index + %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 + store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 + ret void +} + +define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +entry: + %0 = icmp eq i32 0, %a + br i1 %0, label %if, label %else + +if: + %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %2 = load float, float addrspace(2)* %1 + store float %2, float addrspace(1)* %out + br label %endif + +else: + store float 1.0, float addrspace(1)* %out + br label %endif + +endif: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll new file mode 100644 index 00000000000..bf8f11860b5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -0,0 +1,525 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; half args should be promoted to float + +; GCN-LABEL: {{^}}load_f16_arg: +; GCN: s_load_dword [[ARG:s[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; GCN: buffer_store_short [[CVT]] +define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { + store half %arg, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v2f16_arg: +; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm +define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { + store <2 x half> %arg, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v3f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN-DAG: buffer_store_dword +; GCN-DAG:
buffer_store_short +; GCN-NOT: buffer_store +; GCN: s_endpgm +define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { + store <3 x half> %arg, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v4f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { + store <4 x half> %arg, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v8f16_arg: +define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { + store <8 x half> %arg, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_arg: +define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { + %fpext = fpext <2 x half> %in to <2 x float> + store <2 x float> %fpext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f32_arg: +define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to float + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: +define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x float> + store <2 x float> %ext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16 +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_dwordx2 +; GCN: s_endpgm +define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x float> + store <3 x float> %ext, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: +define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x float> + store <4 x float> %ext, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x float> + store <8 x float> %ext, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to double + store double %ext, double addrspace(1)* %out + ret void +} +; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x double> + store <2 x double> %ext, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x double> + store <3 x double> %ext, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x 
double> + store <4 x double> %ext, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x double> + store <8 x double> %ext, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_f16: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + store half %val, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v2f16: +; GCN: buffer_load_dword [[TMP:v[0-9]+]] +; GCN: buffer_store_dword [[TMP]] +define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + store <2 x half> %val, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v4f16: +; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[TMP]] +define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + store <4 x half> %val, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v8f16: +; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: s_endpgm +define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + store <8 x half> %val, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f32: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_dword [[CVT]] +define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to float + store float %cvt, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: +define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: +define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: +define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: +define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x float> + store <16 x float> %cvt, <16 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f64: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] +; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] +; GCN: buffer_store_dwordx2 [[CVT1]] +define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to double + store double %cvt, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x double> + store <2 x double> %cvt, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: +define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x double> + store <3 x double> %cvt, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: +define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x double> + store <4 x double> %cvt, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: +define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x double> + store <8 x double> %cvt, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: +define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, <16 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_short [[CVT]] +define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { + %val = load float, float addrspace(1)* %in + %cvt = fptrunc float %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: +; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; GCN-DAG: buffer_store_short [[CVT0]] +; GCN-DAG: buffer_store_short [[CVT1]] +; GCN: s_endpgm +define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %val = load <2 x float>, <2 x float> addrspace(1)* %in + %cvt = fptrunc <2 x float> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void 
+} + +; FIXME: Shouldn't do 4th conversion +; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_dword +; GCN: s_endpgm +define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %val = load <3 x float>, <3 x float> addrspace(1)* %in + %cvt = fptrunc <3 x float> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { + %val = load <4 x float>, <4 x float> addrspace(1)* %in + %cvt = fptrunc <4 x float> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { + %val = load <8 x float>, <8 x float> addrspace(1)* %in + %cvt = fptrunc <8 x float> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { + %val = load <16 x float>, <16 x float> addrspace(1)* %in + %cvt = fptrunc <16 x float> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} + +; FIXME: Unsafe math should fold conversions away +; GCN-LABEL: {{^}}fadd_f16: +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { + %add = fadd half %a, %b + store half %add, half addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2f16: +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { + %add = fadd <2 x half> %a, %b + store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v4f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 + %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 + %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x half> %a, %b + store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 + ret void +} + +; GCN-LABEL: {{^}}fadd_v8f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { + %add = fadd <8 x half> %a, %b + store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 + ret void +} + +; GCN-LABEL: {{^}}fsub_f16: +; GCN: v_subrev_f32_e32 +; GCN: s_endpgm +define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 + %a = load half, half addrspace(1)* %in + %b = load half, half addrspace(1)* %b_ptr + %sub = fsub half %a, %b + store half %sub, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_from_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { + %val = load half, half addrspace(1)* %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_to_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %val = load i16, i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll new file mode 100644 index 00000000000..f9113399afe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +; HSA: .section .hsa.version +; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" +; HSA: {{^}}simple: +; Make sure we are setting the ATC bit: +; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA: buffer_store_dword 
v{{[0-9]+}}, s[0:[[HI]]], 0 + +define void @simple(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll new file mode 100644 index 00000000000..b11a2113764 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SILowerI1Copies was not handling IMPLICIT_DEF +; SI-LABEL: {{^}}br_implicit_def: +; SI: BB#0: +; SI-NEXT: s_and_saveexec_b64 +; SI-NEXT: s_xor_b64 +; SI-NEXT: BB#1: +define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +bb: + br i1 undef, label %bb1, label %bb2 + +bb1: + store volatile i32 123, i32 addrspace(1)* %out + ret void + +bb2: + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll new file mode 100644 index 00000000000..105cd06b330 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}br_i1_phi: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: v_mov_b32_e32 [[REG]], -1{{$}} +; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: s_endpgm +define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { +bb: + br i1 %arg1, label %bb2, label %bb3 + +bb2: ; preds = %bb + br label %bb3 + +bb3: ; preds = %bb2, %bb + %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] + br i1 %tmp, label %bb4, label %bb6 + +bb4: ; preds = %bb3 + %tmp5 = mul i32 undef, %arg + br label %bb6 + +bb6: ; preds = %bb4, %bb3 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll new file mode 100644 index 00000000000..c218e1918bb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -0,0 +1,11 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = uitofp i8 %1 to double + %3 = fptrunc double %2 to float + store float %3, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll new file mode 100644 index 00000000000..60e59a5a528 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Test that a select with reversed True/False values is correctly lowered +;to a SETNE_INT. There should only be one SETNE_INT instruction. 
+ +;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK-NOT: SETNE_INT + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx1 + %cmp = icmp eq i32 %0, %1 + %value = select i1 %cmp, i32 0, i32 -1 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll new file mode 100644 index 00000000000..0eaa33ebafe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}test_i64_eq: +; SI: v_cmp_eq_i64 +define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp eq i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ne: +; SI: v_cmp_ne_i64 +define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ne i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_slt: +; SI: v_cmp_lt_i64 +define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp slt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ult: +; SI: v_cmp_lt_u64 +define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ult i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sle: +; SI: v_cmp_le_i64 +define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sle i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ule: +; SI: v_cmp_le_u64 +define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ule i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sgt: +; SI: v_cmp_gt_i64 +define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sgt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ugt: +; SI: v_cmp_gt_u64 +define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sge: +; SI: v_cmp_ge_i64 +define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_uge: +; SI: v_cmp_ge_u64 +define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp uge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll new file mode 100644 
index 00000000000..12eed550eb1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -0,0 +1,617 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s + +; Use a 64-bit value with lo bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_lo: +; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: +define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +entry: + store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 + ret void +} + +; Use a 64-bit value with hi bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_hi: +; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] +define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +entry: + store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { + store i64 -9223372036854775808, i64 addrspace(1) *%out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { + store i32 -2147483648, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { + store float 0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { + store float -0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { + store float 0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { + store float -0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { + store float 1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { + store float -1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: +; 
CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { + store float 2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { + store float -2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { + store float 4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { + store float -4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 +; CHECK: buffer_store_dword [[REG]] +define void @store_literal_imm_f32(float addrspace(1)* %out) { + store float 4096.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void 
@add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_literal_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 1024.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36a0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36b0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36e0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffe0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffc0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, 
[[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xfffffffe00000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36ff800000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x3700000000000000 + store float %y, float addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] +; CHECK: 
buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 4.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -4.0 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000001 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000002 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000010 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xffffffffffffffff + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: 
{{^}}add_inline_imm_neg_2_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffffe + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffff0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x000000000000003F + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000040 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { + store double 0.0, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { + store double -0.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { + store double 0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { + store double -0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 
v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { + store double 1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { + store double -1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { + store double 2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { + store double -2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { + store double 4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { + store double -4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_f64(double addrspace(1)* %out) { + store double 4096.0, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll new file mode 100644 index 00000000000..f551606d63a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -0,0 +1,121 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; Tests for indirect addressing on SI, which is implemented using dynamic +; indexing of vectors. 
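+;
+; As a rough illustration (an assumption for this note, not output captured
+; from llc), a dynamic extract such as
+;
+;   %elt = extractelement <4 x float> %vec, i32 %idx
+;
+; is expected to select the lane by writing the index into m0 and then
+; issuing a relative VGPR move, along the lines of:
+;
+;   s_mov_b32 m0, s2            ; s2 holds the dynamic index
+;   v_movrels_b32_e32 v4, v0    ; v0 is the first register of the vector
+;
+; The register numbers here are placeholders; the CHECK lines below only
+; match the mnemonics.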
+
+; CHECK-LABEL: {{^}}extract_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0
+define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0
+; CHECK: s_cbranch_execnz
+define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
+  %2 = extractelement <4 x float> %1, i32 2
+  store float %2, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+  %1 = extractelement <4 x float> %0, i32 2
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}}
+define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, -{{[0-9]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -16
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
new file mode 100644
index 00000000000..d63e1b6c521
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
+
+declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load double, double addrspace(1)* %in, align 8
+  %array = alloca double, i32 16, align 8
+  %ptr = getelementptr double, double* %array, i32 %b
+  store double %val, double* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load double, double* %ptr, align 8
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
+  %array = alloca <2 x double>, i32 16, align 16
+  %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+  store <2 x double> %val, <2 x double>* %ptr, align 16
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load <2 x double>, <2 x double>* %ptr, align 16
+  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load i64, i64 addrspace(1)* %in, align 8
+  %array = alloca i64, i32 16, align 8
+  %ptr = getelementptr i64, i64* %array, i32 %b
+  store i64 %val, i64* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local()
noduplicate nounwind + %result = load i64, i64* %ptr, align 8 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}private_access_v2i64_alloca: + +; SI-ALLOCA: buffer_store_dwordx4 +; SI-ALLOCA: buffer_load_dwordx4 + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %array = alloca <2 x i64>, i32 16, align 16 + %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b + store <2 x i64> %val, <2 x i64>* %ptr, align 16 + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + %result = load <2 x i64>, <2 x i64>* %ptr, align 16 + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll new file mode 100644 index 00000000000..f6e39b3d830 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -0,0 +1,10 @@ +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +define void @inf_loop_irreducible_cfg() nounwind { +entry: + br label %block + +block: + br label %block +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll new file mode 100644 index 00000000000..7233aa57fd7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}infinite_loop: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 +; SI: BB0_1: +; SI: buffer_store_dword [[REG]] +; SI: s_waitcnt vmcnt(0) expcnt(0) +; SI: s_branch BB0_1 +define void @infinite_loop(i32 addrspace(1)* %out) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + store i32 999, i32 addrspace(1)* %out, align 4 + br label %for.body +} + diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll new file mode 100644 index 00000000000..efc2292de3a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}inline_asm: +; CHECK: s_endpgm +; CHECK: s_endpgm +define void @inline_asm(i32 addrspace(1)* %out) { +entry: + store i32 5, i32 addrspace(1)* %out + call void asm sideeffect "s_endpgm", ""() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/inline-calls.ll b/llvm/test/CodeGen/AMDGPU/inline-calls.ll new file mode 100644 index 00000000000..33a4c832e75 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/inline-calls.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s + +; CHECK-NOT: {{^}}func: +define internal fastcc i32 @func(i32 %a) { +entry: + %tmp0 = add i32 %a, 1 + ret i32 %tmp0 +} + +; CHECK: {{^}}kernel: +define void @kernel(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 
@func(i32 1)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: {{^}}kernel2:
+define void @kernel2(i32 addrspace(1)* %out) {
+entry:
+  call void @kernel(i32 addrspace(1)* %out)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/input-mods.ll b/llvm/test/CodeGen/AMDGPU/input-mods.ll
new file mode 100644
index 00000000000..1c4d285cbcb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/input-mods.ll
@@ -0,0 +1,26 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM
+
+;EG-LABEL: {{^}}test:
+;EG: EXP_IEEE *
+;CM-LABEL: {{^}}test:
+;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|
+;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X|
+
+define void @test(<4 x float> inreg %reg0) #0 {
+  %r0 = extractelement <4 x float> %reg0, i32 0
+  %r1 = call float @llvm.fabs.f32(float %r0)
+  %r2 = fsub float -0.000000e+00, %r1
+  %r3 = call float @llvm.exp2.f32(float %r2)
+  %vec = insertelement <4 x float> undef, float %r3, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
+  ret void
+}
+
+declare float @llvm.exp2.f32(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
new file mode 100644
index 00000000000..4a5e8869c2d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
+
+; Test that INSERT_SUBREG instructions don't have non-register operands after
+; instruction selection.
+
+; Make sure this doesn't crash
+; CHECK-LABEL: test:
+define void @test(i64 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca [16 x i32]
+  %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
+  %tmp2 = sext i32 %tmp1 to i64
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
new file mode 100644
index 00000000000..6de3d408c48
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -0,0 +1,252 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+; FIXME: Broken on evergreen
+; FIXME: For some reason the 8- and 16-element vectors are being stored as
+; individual elements instead of as 128-bit stores.
+
+
+; FIXME: Why is the constant moved into the intermediate register and
+; not just directly into the vector component?
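+;
+; For the dynamic cases below, the lowering is expected to look roughly like
+; this (an illustrative sketch with placeholder register numbers, not
+; captured llc output):
+;
+;   v_mov_b32_e32 v4, 0x40a00000    ; materialize 5.0 in a scratch VGPR
+;   s_mov_b32 m0, s4                ; m0 selects the vector lane to write
+;   v_movreld_b32_e32 v0, v4        ; move relative to the vector base
+;
+; i.e. the constant passes through an intermediate VGPR first, which is what
+; the FIXME above is asking about.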
+ +; SI-LABEL: {{^}}insertelement_v4f32_0: +; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: +; v_mov_b32_e32 +; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 +; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] +; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: +define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_1: +define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_2: +define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_3: +define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4i32_0: +define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { + %vecins = insertelement <4 x i32> %a, i32 999, i32 0 + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { + %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b + store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { + %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b + store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { + %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b + store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i32: +; SI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i32> %a, i32 5, i32 %b + store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i32: +; SI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i32> %a, i32 5, i32 %b + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i32> %a, i32 5, i32 %b + store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i32> %a, i32 5, i32 %b + store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i16: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i16> %a, i16 5, i32 %b + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i16: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i16> %a, i16 5, i32 %b + store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i8: +; FIXMESI: BUFFER_STORE_USHORT +define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i8> %a, i8 5, i32 %b + store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i8: +; FIXMESI: buffer_store_dword +define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i8> %a, i8 5, i32 %b + store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i8: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i8> %a, i8 5, i32 %b + store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i8: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i8> %a, i8 5, i32 %b + store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 + ret void +} + +; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that +; the compiler doesn't crash. 
+; SI-LABEL: {{^}}insert_split_bb: +define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +entry: + %0 = insertelement <2 x i32> undef, i32 %a, i32 0 + %1 = icmp eq i32 %a, 0 + br i1 %1, label %if, label %else + +if: + %2 = load i32, i32 addrspace(1)* %in + %3 = insertelement <2 x i32> %0, i32 %2, i32 1 + br label %endif + +else: + %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %5 = load i32, i32 addrspace(1)* %4 + %6 = insertelement <2 x i32> %0, i32 %5, i32 1 + br label %endif + +endif: + %7 = phi <2 x i32> [%3, %if], [%6, %else] + store <2 x i32> %7, <2 x i32> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { + %vecins = insertelement <2 x double> %a, double 8.0, i32 %b + store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i64> %a, i64 5, i32 %b + store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { + %vecins = insertelement <4 x double> %a, double 8.0, i32 %b + store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { + %vecins = insertelement <8 x double> %a, double 8.0, i32 %b + store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/jump-address.ll b/llvm/test/CodeGen/AMDGPU/jump-address.ll new file mode 100644 index 00000000000..f55912e3740 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/jump-address.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: JUMP @6 +; CHECK: EXPORT +; CHECK-NOT: EXPORT + +define void @main() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = 
bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 0xFFF8000000000000, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll new file mode 100644 index 00000000000..7e2291cfdc3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll @@ -0,0 +1,100 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main1: +; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} +define void @main1() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = extractelement <4 x float> %10, i32 1 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 2 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %27 = extractelement <4 x float> %26, 
i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +; CHECK: {{^}}main2: +; CHECK-NOT: MOV +define void @main2() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 1 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %11 = extractelement <4 x float> %10, i32 0 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 3 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 2 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call 
float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll new file mode 100644 index 00000000000..1dd7c2cb799 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -0,0 +1,473 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i8_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ubyte + +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +entry: + %0 = sext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ushort + +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +entry: + %0 = sext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i32_arg: +; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +entry: + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}f32_arg: +; EG: 
T{{[0-9]\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +entry: + store float %in, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +entry: + store <2 x i8> %in, <2 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN-DAG: buffer_load_ushort +; GCN-DAG: buffer_load_ushort +define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +entry: + store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +entry: + store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i8_arg: +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 +define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +entry: + store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i16_arg: +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 +define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +entry: + store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}v3i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +entry: + store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +entry: + store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}v4i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +entry: + store <4 x i16> %in, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +entry: + store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +entry: + store <8 x i8> %in, <8 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +entry: + store <8 x i16> %in, <8 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { +entry: + store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +entry: + store <8 x float> %in, <8 x float> 
addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v16i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +entry: + store <16 x i8> %in, <16 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +entry: + store <16 x i16> %in, <16 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +entry: + store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v16f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; 
VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +entry: + store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}kernel_arg_i64: +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: buffer_store_dwordx2 +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}f64_kernel_arg: +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c +; GCN: buffer_store_dwordx2 +define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +entry: + store double %in, double addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}kernel_arg_v1i64: +; XGCN: s_load_dwordx2 +; XGCN: s_load_dwordx2 +; XGCN: buffer_store_dwordx2 +; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-alloca.ll new file mode 100644 index 00000000000..671833d1a33 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/large-alloca.ll @@ -0,0 +1,15 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s + +define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %0 = load i32, i32* %gep1 + store i32 %0, i32 addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll new file mode 100644 index 00000000000..9975b1b7f5c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s +; CHECK: s_endpgm + +@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 + +define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 + %mul12 = mul nsw i32 %val, 7 + br i1 undef, label %exit, label %bb + +bb: + %cmp = icmp slt i32 %x, 0 + br label %exit + +exit: + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll new file mode 100644 index 00000000000..bf8df63be9f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_init_lds_global + +@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] + +define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll new file mode 100644 index 00000000000..6ff6fc3d7af --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s + +; The test is for a bug in R600EmitClauseMarkers.cpp where this pass +; was searching for a use of the OQAP register in order to determine +; if an LDS instruction could fit in the current clause, but never finding +; one. This created an infinite loop and hung the compiler. +; +; The LDS instruction should not have been defining OQAP in the first place, +; because the LDS instructions are pseudo instructions and the OQAP +; reads and writes are bundled together in the same instruction. + +; CHECK: {{^}}lds_crash: +define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = load i32, i32 addrspace(3)* %in + ; This block needs to be > 115 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %0, %b + %div1 = udiv i32 %div0, %a + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + store i32 %div7, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll new file mode 100644 index 00000000000..44ffc36af14 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s +; +; This test checks that the lds input queue is empty at the end of +; the ALU clause. + +; CHECK-LABEL: {{^}}lds_input_queue: +; CHECK: LDS_READ_RET * OQAP +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP + +@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 + +define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index + %1 = load i32, i32 addrspace(3)* %0 + call void @llvm.AMDGPU.barrier.local() + + ; This will start a new clause for the vertex fetch + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +; The machine scheduler does not do proper alias analysis and assumes that +; loads from global values (note that a global value is different from a +; value from global memory; a global value is a value that is declared +; outside of a function and can reside in any address space) alias with +; all other loads. +; +; This is a problem for scheduling the reads from the local data share (lds). +; These reads are implemented using two instructions. The first copies the +; data from lds into the lds output queue, and the second moves the data from +; the output queue into main memory. These two instructions don't have to be +; scheduled one after the other, but they do need to be scheduled in the same +; clause. 
The aliasing problem mentioned above causes problems when there is a +; load from global memory that immediately follows a load from a global value that +; has been declared in the local memory space: +; +; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index +; %1 = load i32, i32 addrspace(3)* %0 +; %2 = load i32, i32 addrspace(1)* %in +; +; The instruction selection phase will generate ISA that looks like this: +; %OQAP = LDS_READ_RET +; %vreg0 = MOV %OQAP +; %vreg1 = VTX_READ_32 +; %vreg2 = ADD_INT %vreg1, %vreg0 +; +; The bottom scheduler will schedule the two ALU instructions first: +; +; UNSCHEDULED: +; %OQAP = LDS_READ_RET +; %vreg1 = VTX_READ_32 +; +; SCHEDULED: +; +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg0 +; +; The lack of proper alias analysis causes the local memory read (LDS_READ_RET) +; to be treated as having a chain dependency on the global memory read +; (VTX_READ_32), so the global memory read will always be scheduled first. +; This will give us a final program which looks like this: +; +; Alu clause: +; %OQAP = LDS_READ_RET +; VTX clause: +; %vreg1 = VTX_READ_32 +; Alu clause: +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg0 +; +; This is an illegal program because the OQAP def and use now occur in +; different ALU clauses. +; +; This test checks this scenario and makes sure it doesn't result in an +; illegal program. For now, we have fixed this issue by merging the +; LDS_READ_RET and MOV together during instruction selection and then +; expanding them after scheduling. Once the scheduler has better alias +; analysis, we should be able to keep these instructions separate before +; scheduling. +; +; CHECK-LABEL: {{^}}local_global_alias: +; CHECK: LDS_READ_RET +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP +define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 + %1 = load i32, i32 addrspace(3)* %0 + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %2, %1 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll new file mode 100644 index 00000000000..3e8328659fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test makes sure we do not double count global values when they are +; used in different basic blocks. 
+ +; CHECK: .long 166120 +; CHECK-NEXT: .long 1 +; CHECK-LABEL: {{^}}test: +@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 + +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + store i32 1, i32 addrspace(3)* @lds + br label %endif + +else: + store i32 2, i32 addrspace(3)* @lds + br label %endif + +endif: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll new file mode 100644 index 00000000000..fb51bc0e50c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global + +@lds = addrspace(3) global [256 x i32] zeroinitializer + +define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll new file mode 100644 index 00000000000..4244c48d240 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests a bug where LegalizeDAG was not checking the target's +; BooleanContents value and always using one for true, when expanding +; setcc to select_cc. +; +; This bug caused the icmp IR instruction to be expanded to two machine +; instructions, when only one is needed. +; + +; CHECK: {{^}}setcc_expand: +; CHECK: SET +; CHECK-NOT: CND +define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp eq i32 %in, 5 + br i1 %0, label %IF, label %ENDIF +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lit.local.cfg b/llvm/test/CodeGen/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/CodeGen/AMDGPU/literals.ll b/llvm/test/CodeGen/AMDGPU/literals.ll new file mode 100644 index 00000000000..cff1c24f89d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/literals.ll @@ -0,0 +1,64 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Test using an integer literal constant. +; Generated ASM should be: +; ADD_INT KC0[2].Z literal.x, 5 +; or +; ADD_INT literal.x KC0[2].Z, 5 + +; CHECK: {{^}}i32_literal: +; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5 +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = add i32 5, %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test using a float literal constant. 
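+; (The value 1084227584 checked below is 0x40A00000, the IEEE-754 +; single-precision bit pattern of 5.0.)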
+; Generated ASM should be: +; ADD KC0[2].Z literal.x, 5.0 +; or +; ADD literal.x KC0[2].Z, 5.0 + +; CHECK: {{^}}float_literal: +; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.0 +define void @float_literal(float addrspace(1)* %out, float %in) { +entry: + %0 = fadd float 5.0, %in + store float %0, float addrspace(1)* %out + ret void +} + +; Make sure inline literals are folded into REG_SEQUENCE instructions. +; CHECK: {{^}}inline_literal_reg_sequence: +; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 + +define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: {{^}}inline_literal_dot4: +; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 +; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 +define void @inline_literal_dot4(float addrspace(1)* %out) { +entry: + %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll new file mode 100644 index 00000000000..8bf094b8bc7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone + +; Legacy name +declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_abs_i32: +; SI: s_sub_i32 +; SI: s_max_i32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { + %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_i32: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll new file mode 100644 index 00000000000..db883972d64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s 
| FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_global: +; EG: GROUP_BARRIER +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_global(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.global() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.global() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll new file mode 100644 index 00000000000..48fb2e0b1a8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_local: +; EG: GROUP_BARRIER + +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_local(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.local() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll new file mode 100644 index 00000000000..1168713ca66 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -0,0 +1,437 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac +define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: +; SI: v_bfe_i32 +; EG: 
BFE_INT +define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_bfe_print_arg: +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 +define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { + %load = load i32, i32 addrspace(1)* %src0, align 4 + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_7: +; SI-NOT: shl +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_8: +; SI: buffer_load_dword +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; 
SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_13: +; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* 
%out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE 
+define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 +; SI: buffer_store_dword [[BFE]], +define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) + %shl = shl i32 %bfe, 8 + %ashr = ashr i32 %shl, 8 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @simplify_demanded_bfe_sdiv +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 +; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] +; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] +; SI: v_ashrrev_i32_e32 
[[TMP2:v[0-9]+]], 1, [[TMP1]] +; SI: buffer_store_dword [[TMP2]] +define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone + %div = sdiv i32 %bfe, 2 + store i32 %div, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll new file mode 100644 index 00000000000..541119242a9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -0,0 +1,627 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: +; SI: buffer_load_ubyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: +; SI: 
buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_1: +; SI: buffer_load_dword +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI: s_endpgm +; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, +define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 
addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_4: +; SI-NOT: lshl +; SI-NOT: shr +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_5: +; SI: buffer_load_dword +; SI-NOT: lshl +; SI-NOT: shr +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_7: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; 
SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_13: +; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* 
%out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void 
@bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure that SimplifyDemandedBits doesn't cause the and to be +; reduced to the bits demanded by the bfe. + +; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
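The constant-fold expectations in the BFE tests above follow from the usual bitfield-extract semantics. As a quick cross-check, here is a minimal Python sketch of those semantics (assuming offset and width are interpreted modulo 32 and a width of 0 yields 0; the helper names are illustrative, not part of LLVM):

    def bfe_u32(src, offset, width):
        # Unsigned bitfield extract: zero-extend bits [offset, offset+width).
        offset &= 31
        width &= 31
        if width == 0:
            return 0
        return (src >> offset) & ((1 << width) - 1)

    def bfe_i32(src, offset, width):
        # Signed bitfield extract: sign-extend the top bit of the field.
        field = bfe_u32(src, offset, width)
        width &= 31
        if width and field & (1 << (width - 1)):
            field -= 1 << width
        return field

    # Spot-check a few of the expected v_mov_b32 immediates above:
    assert bfe_i32(160, 4, 4) == -6      # bfe_i32_constant_fold_test_11
    assert bfe_u32(160, 4, 4) == 10      # bfe_u32_constant_fold_test_11
    assert bfe_i32(128, 0, 8) == -128    # bfe_i32_constant_fold_test_6 (0xffffff80)
    assert bfe_u32(128, 0, 8) == 0x80    # bfe_u32_constant_fold_test_6
    assert bfe_u32(131070, 16, 16) == 1  # bfe_u32_constant_fold_test_13
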
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: +; SI: buffer_load_dword [[ARG:v[0-9]+]] +; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] +; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 +; SI-DAG: buffer_store_dword [[AND]] +; SI-DAG: buffer_store_dword [[BFE]] +; SI: s_endpgm +define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, + i32 addrspace(1)* %out1, + i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %and = and i32 %src, 63 + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 + store i32 %and, i32 addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lshr_and: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = lshr i32 %a, 6 + %c = and i32 %b, 7 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_lshr_and: +; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 +; SI: buffer_store_dword +define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = lshr i32 %a, %b + %d = and i32 %c, 7 + store i32 %d, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 448 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr2: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 511 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}shl_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 +; SI: buffer_store_dword +define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = shl i32 %a, 9 + %c = lshr i32 %b, 11 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll new file mode 100644 index 00000000000..517a55abc09 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfi_arg_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_arg_imm: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_imm_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { 
+ %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_imm_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll new file mode 100644 index 00000000000..50492289d74 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfm_arg_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_arg_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b +; EG: BFM_INT +define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 +; EG: BFM_INT +define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + %c = shl i32 %b, %y + store i32 %c, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern_simple: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 +define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + store i32 %b, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll new file mode 100644 index 00000000000..301de4b1c82 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: 
buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+  %brev = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+  store i32 %brev, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_brev_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+  %val = load i32, i32 addrspace(1)* %valptr, align 4
+  %brev = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
+  store i32 %brev, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
new file mode 100644
index 00000000000..11ec963ab31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
+declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
+
+; FUNC-LABEL: {{^}}clamp_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+
+; EG: MOV_SAT
+define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fneg = fsub float -0.0, %src
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %src.fneg.fabs = fsub float -0.0, %src.fabs
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp,
float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll new file mode 100644 index 00000000000..805a88b59c7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -0,0 +1,497 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_class_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fneg = fsub float -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f32(i32 
addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %a.fneg.fabs = fsub float -0.0, %a.fabs + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 10 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_9bit_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f32: +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; 
SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; FIXME: Why isn't this using a literal constant operand? +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fneg = fsub double -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: 
{{^}}test_class_fneg_fabs_f64:
+; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+  %a.fabs = call double @llvm.fabs.f64(double %a) #1
+  %a.fneg.fabs = fsub double -0.0, %a.fabs
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_1_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
+; SI: s_endpgm
+define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_64_f64:
+; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
+; SI: s_endpgm
+define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Set all 9 low bits of the mask (0x1ff); bit 9, +infinity, is left clear.
+; SI-LABEL: {{^}}test_class_full_mask_f64:
+; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI-NEXT: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}v_test_class_full_mask_f64:
+; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
+; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]]
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext, i32 addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
+; XSI: v_cmp_class_f64_e32 vcc, 1.0,
+; SI: v_cmp_class_f64_e32 vcc,
+; SI: s_endpgm
+define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+  %b = load i32, i32 addrspace(1)* %gep.in
+
+  %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1
+  %sext = sext i1 %result to i32
+  store i32 %sext,
i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: +; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or3_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + + %sext = sext i1 %or.1 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 + %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 + %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 + %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 + %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, 
%class2 + %or.2 = or i1 %or.1, %class3 + %or.3 = or i1 %or.2, %class4 + %or.4 = or i1 %or.3, %class5 + %or.5 = or i1 %or.4, %class6 + %or.6 = or i1 %or.5, %class7 + %or.7 = or i1 %or.6, %class8 + %or.8 = or i1 %or.7, %class9 + %sext = sext i1 %or.8 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_1: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_2: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} +; SI: s_or_b64 +; SI: s_endpgm +define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f32: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f64: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll new file mode 100644 index 00000000000..e95a51093cb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll @@ -0,0 +1,59 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}cube: +; CHECK: CUBE T{{[0-9]}}.X +; CHECK: CUBE T{{[0-9]}}.Y +; CHECK: CUBE T{{[0-9]}}.Z +; CHECK: CUBE * T{{[0-9]}}.W +define void @cube() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %1 = extractelement <4 x float> %0, i32 3 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %3 = extractelement <4 x float> %2, i32 0 + %4 = fdiv float %3, %1 + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %6 = extractelement <4 x float> %5, i32 1 + %7 = fdiv float %6, %1 + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %9 = extractelement <4 x float> %8, i32 2 + %10 = fdiv float %9, %1 + %11 = insertelement <4 x float> undef, float %4, i32 0 + %12 = insertelement <4 x float> %11, float %7, i32 1 + %13 = insertelement <4 x float> %12, float %10, i32 2 + %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 + %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) + %16 = extractelement <4 x float> %15, i32 0 + %17 = extractelement <4 x float> %15, i32 1 + %18 = extractelement <4 x float> %15, i32 2 + %19 = extractelement <4 x float> %15, i32 3 + %20 = call float @fabs(float %18) + %21 = fdiv float 1.000000e+00, %20 + %22 = fmul float %16, %21 + %23 = fadd float %22, 1.500000e+00 + %24 = fmul float %17, %21 + %25 = fadd float %24, 1.500000e+00 + %26 = insertelement <4 x float> undef, float %25, i32 0 + %27 = insertelement <4 x float> %26, float %23, i32 1 + %28 = insertelement <4 x float> %27, float %19, i32 2 + %29 = insertelement <4 x float> %28, float %25, i32 3 + %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) + call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll new file mode 100644 index 00000000000..8b32f696449 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone + +; SI-LABEL: {{^}}test_unpack_byte0_to_float: +; SI: v_cvt_f32_ubyte0 +define void @test_unpack_byte0_to_float(float addrspace(1)* %out, 
i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte1_to_float: +; SI: v_cvt_f32_ubyte1 +define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte2_to_float: +; SI: v_cvt_f32_ubyte2 +define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte3_to_float: +; SI: v_cvt_f32_ubyte3 +define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll new file mode 100644 index 00000000000..55ca9c7536e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone +declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fixup_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fixup_f64: +; GCN: v_div_fixup_f64 +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll new file mode 100644 index 00000000000..bcb7f870f1f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -0,0 +1,179 @@ +; RUN: llc -march=amdgcn -mcpu=SI 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: Enable for VI. + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate +declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone +declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fmas_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone + store float 
%result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f64: +; GCN: v_div_fmas_f64 +define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { + %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: +; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { + %cmp = icmp eq i32 %i, 0 + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: +; SI: s_mov_b64 vcc, 0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: +; SI: s_mov_b64 vcc, -1 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} +; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] +; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] +; SI: s_endpgm +define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + %cmp1 = icmp ne i32 %d, 0 + %and = and i1 %cmp0, %cmp1 + + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: +; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] + +; SI: buffer_load_dword 
[[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc + + +; SI: BB9_2: +; SI: s_or_b64 exec, exec, [[SAVE]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + br i1 %cmp0, label %bb, label %exit + +bb: + %val = load i32, i32 addrspace(1)* %dummy + %cmp1 = icmp ne i32 %val, 0 + br label %exit + +exit: + %cond = phi i1 [false, %entry], [%cmp1, %bb] + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll new file mode 100644 index 00000000000..de830de039c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll @@ -0,0 +1,364 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone +declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; SI-LABEL @test_div_scale_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + 
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+
+  %a = load float, float addrspace(1)* %gep.0, align 4
+  %b = load float, float addrspace(1)* %gep.1, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f64_2:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+
+  %a = load double, double addrspace(1)* %gep.0, align 8
+  %b = load double, double addrspace(1)* %gep.1, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL @test_div_scale_f32_scalar_num_1:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
+
+  %b = load float, float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL
@test_div_scale_f32_scalar_num_2: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_1: +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_2: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_1: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_2: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void 
@test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_1: +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] +; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] +; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 
2.0, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_div_scale_f32_fabs_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_div_scale_f32_fabs_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll new file mode 100644 index 00000000000..20c7af8ade5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_flbit: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_flbit: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm 
+define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll new file mode 100644 index 00000000000..e098dd35d6d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f64: +; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %neg = fsub double 0.0, %val + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg_abs: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] 
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %abs = call double @llvm.fabs.f64(double %val) + %neg = fsub double 0.0, %abs + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll new file mode 100644 index 00000000000..7501b4b7546 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float %Val) +declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone + +; Legacy name +declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f32: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %neg = fsub float 0.0, %val + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg_abs: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void 
@fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %abs = call float @llvm.fabs.f32(float %val) + %neg = fsub float 0.0, %abs + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll new file mode 100644 index 00000000000..42102e30f07 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FIXME: Store of i32 seems to be broken pre-EG somehow? + +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imad24: +; SI: v_mad_i32_i24 +; CM: MULADD_INT24 +; R600: MULLO_INT +; R600: ADD_INT +define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll new file mode 100644 index 00000000000..46662f96c29 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s 
-march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_imin: +; SI: v_min_i32_e32 +define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_imin: +; SI: s_min_i32 +define void @scalar_imin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.imin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll new file mode 100644 index 00000000000..fdc1172260b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imul24: +; SI: v_mul_i32_i24 +; CM: MUL_INT24 +; R600: MULLO_INT +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll new file mode 100644 index 00000000000..057708e7b5c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kill_gs_const: +; SI-NOT: v_cmpx_le_f32 +; SI: s_mov_b64 exec, 0 + +define void @kill_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %3) + ret void +} + +; SI-LABEL: {{^}}kill_vcc_implicit_def: +; SI-NOT: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} +; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, 
i32, float, float) #1 { +entry: + %tmp0 = fcmp olt float %13, 0.0 + call void @llvm.AMDGPU.kill(float %14) + %tmp1 = select i1 %tmp0, float 1.0, float 0.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + ret void +} + +declare void @llvm.AMDGPU.kill(float) +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="2" } +attributes #1 = { "ShaderType"="0" } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll new file mode 100644 index 00000000000..a59c0ce6d67 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone +declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_ldexp_f32: +; SI: v_ldexp_f32 +; SI: s_endpgm +define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { + %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_ldexp_f64: +; SI: v_ldexp_f64 +; SI: s_endpgm +define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { + %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll new file mode 100644 index 00000000000..4cafd563685 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_legacy_f32: +; SI: v_rsq_legacy_f32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll new file mode 100644 index 00000000000..83b56a5029d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.mul(float ,float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll new file mode 100644 index 00000000000..d2a655bf909 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f64: +; SI: v_rcp_f64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rcp_pat_f64: +; SI: v_rcp_f64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE-NOT: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64 +; SI-SAFE: v_rcp_f64 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll new file mode 100644 index 00000000000..edd6e9a72f1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s + +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + +declare float @llvm.sqrt.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f32: +; SI: v_rcp_f32_e32 +; EG: RECIP_IEEE +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Evergreen only ever does unsafe fp math. 
+; FUNC-LABEL: {{^}}rcp_pat_f32: + +; SI-SAFE: v_rcp_f32_e32 +; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 + +; EG: RECIP_IEEE + +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: +; SI-UNSAFE: v_rsq_f32_e32 +; SI-SAFE: v_sqrt_f32_e32 +; SI-SAFE: v_rcp_f32_e32 + +; EG: RECIPSQRT_IEEE +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll new file mode 100644 index 00000000000..67f1d22c717 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f64: +; SI: v_rsq_clamp_f64_e32 + +; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] +; TODO: this constant should be folded: +; VI: s_mov_b32 s[[ALLBITS:[0-9]+]], -1 +; VI: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff +; VI: s_mov_b32 s[[LOW1:[0-9]+]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff +; VI: s_mov_b32 s[[LOW2:[0-9]+]], s[[ALLBITS]] +; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] + +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll new file mode 100644 index 00000000000..eeff2536b23 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f32: +; SI: v_rsq_clamp_f32_e32 + +; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} +; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] +; TODO: this constant should be folded: +; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff +; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] + +; EG: RECIPSQRT_CLAMPED + +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll new file mode 100644 index 00000000000..36b72f14db1 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; EG: RECIPSQRT_IEEE +define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll new file mode 100644 index 00000000000..10206609bb5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll @@ -0,0 +1,42 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %addr = load <4 x float>, <4 x float> addrspace(1)* %in + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x 
float> %res5, i32 0, i32 0, i32 6) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) + store <4 x float> %res16, <4 x float> addrspace(1)* %out + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll new file mode 100644 index 00000000000..6b546a7e17c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_trig_preop_f64: +; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] +; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: +; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll new file mode 100644 index 00000000000..74792e50017 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; R600: {{^}}amdgpu_trunc: +; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: {{^}}amdgpu_trunc: +; SI: v_trunc_f32 + +define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { +entry: + %0 = call float @llvm.AMDGPU.trunc(float %x) + 
store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.trunc(float ) readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll new file mode 100644 index 00000000000..77a073b0cb0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}test_umad24: +; SI: v_mad_u32_u24 +; EG: MULADD_UINT24 +; R600: MULLO_UINT +; R600: ADD_INT +define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}commute_umad24: +; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] +; SI: buffer_store_dword [[RESULT]] +define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %src0.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 + + %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 + %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out.gep, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll new file mode 100644 index 00000000000..a97d103016d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umax: +; SI: v_max_u32_e32 +define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umax: +; SI: s_max_u32 +define void @scalar_umax(i32 %p0, i32 %p1) #0 { +entry: + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; 
SI-LABEL: {{^}}trunc_zext_umax: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: buffer_store_short [[RESULT]], +define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umax(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll new file mode 100644 index 00000000000..2acd10e0c63 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umin: +; SI: v_min_u32_e32 +define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umin: +; SI: s_min_u32 +define void @scalar_umin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}trunc_zext_umin: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: buffer_store_short [[RESULT]], +define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll new file mode 100644 index 00000000000..76624a078b3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 
-verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_umul24: +; SI: v_mul_u32_u24 +; EG: MUL_UINT24 +; R600: MULLO_UINT +define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll new file mode 100644 index 00000000000..3d05da616e4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -0,0 +1,59 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +;GCN-LABEL: {{^}}main: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 +;GCN-NEXT: v_interp_mov_f32 +;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) + %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) + ret void +} + +; Test that v_interp_p1 uses different source and destination registers +; on 16 bank LDS chips. 
+ +; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: +; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] + +define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +main_body: + %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) + %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) + %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) + %25 = call float @fabs(float %22) + %26 = call float @fabs(float %23) + %27 = call float @fabs(float %24) + %28 = call i32 @llvm.SI.packf16(float %25, float %26) + %29 = bitcast i32 %28 to float + %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) + %31 = bitcast i32 %30 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) + ret void +} + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.constant(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll new file mode 100644 index 00000000000..275cb580bc9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -0,0 +1,509 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}gather4_v2: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl: +;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l: +;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b: +;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl: +;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_v8: +;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_v2: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 
x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_o: +;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o_v8: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = 
extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o_v8: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o_v8: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_o: +;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_o: +;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = 
extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c: +;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_v8: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_v8: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_v8: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl: +;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz: +;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c_o: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float 
%r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_o_v8: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_o: +;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_o: +;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_o: +;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl_o: +;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o: +;CHECK: 
image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: +;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, 
i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.getlod.ll new file mode 100644 index 00000000000..06ee98e91b3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.getlod.ll @@ -0,0 +1,45 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}getlod: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v2: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x 
i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v4: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + + +declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.image.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.ll new file mode 100644 index 00000000000..0fac8d79956 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.ll @@ -0,0 +1,50 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}image_load: +;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}image_load_mip: +;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load_mip() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}getresinfo: +;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getresinfo() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + 
%r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll new file mode 100644 index 00000000000..4bc638a2806 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = 
extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = 
extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 
= extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, 
<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll new file mode 100644 index 00000000000..9d8935414ed --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 
15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 
0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l_o 
{{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm 
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.imageload.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.SI.imageload.ll new file mode 100644 index 00000000000..b67716c3b66 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.imageload.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, + <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, + <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, + <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, + <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, + <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6, + <32 x i8> undef, i32 6) + %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, + <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, + <32 x i8> undef, i32 11) + %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, + <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, + <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + %e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 
%s3, %e5 + %s5 = add i32 %s4, %e6 + %s9 = add i32 %s5, %e10 + %s10 = add i32 %s9, %e11 + %s14 = add i32 %s10, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +; Test that coordinates are stored in vgprs and not sgprs +; CHECK: vgpr_coords +; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} +define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 + %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 + %22 = getelementptr float, float addrspace(2)* %21, i32 0 + %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 + %24 = getelementptr float, float addrspace(2)* %21, i32 1 + %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 + %26 = getelementptr float, float addrspace(2)* %21, i32 4 + %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 + %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 + %30 = bitcast float %27 to i32 + %31 = bitcast float %23 to i32 + %32 = bitcast float %25 to i32 + %33 = insertelement <4 x i32> undef, i32 %31, i32 0 + %34 = insertelement <4 x i32> %33, i32 %32, i32 1 + %35 = insertelement <4 x i32> %34, i32 %30, i32 2 + %36 = insertelement <4 x i32> %35, i32 undef, i32 3 + %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) + %38 = extractelement <4 x i32> %37, i32 0 + %39 = extractelement <4 x i32> %37, i32 1 + %40 = extractelement <4 x i32> %37, i32 2 + %41 = extractelement <4 x i32> %37, i32 3 + %42 = bitcast i32 %38 to float + %43 = bitcast i32 %39 to float + %44 = bitcast i32 %40 to float + %45 = bitcast i32 %41 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) + ret void +} + +declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone +; Function Attrs: nounwind readnone +declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null} +!1 = !{} +!2 = !{!0, !0, i64 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll new file mode 100644 index 00000000000..f6c258539d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +; Example of a simple geometry shader loading vertex attributes from the +; ESGS ring buffer + +; FIXME: Out of bounds immediate offset crashes + +; CHECK-LABEL: {{^}}main: +; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}},
{{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc +; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc + +define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { +main_body: + %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = shl i32 %arg6, 2 + %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) + %tmp13 = bitcast i32 %tmp12 to float + %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp15 = bitcast i32 %tmp14 to float + %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) + %tmp17 = bitcast i32 %tmp16 to float + %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp19 = bitcast i32 %tmp18 to float + + %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp21 = bitcast i32 %tmp20 to float + + %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp23 = bitcast i32 %tmp22 to float + + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll new file mode 100644 index 00000000000..ac95fd0b83a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +; CHECK-DAG: 
image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, + i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { + %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) + %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) + %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) + %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) + %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) + %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) + %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) + %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) + %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + %e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t4 = extractelement <4 x i32> %res7, i32 0 + %t5 = extractelement <4 x i32> %res7, i32 3 + %e7 = add i32 %t4, %t5 + %t6 = extractelement <4 x i32> %res8, i32 1 + %t7 = extractelement <4 x i32> %res8, i32 2 + %e8 = add i32 %t6, %t7 + %t8 = extractelement <4 x i32> %res9, i32 1 + %t9 = extractelement <4 x i32> %res9, i32 3 + %e9 = add i32 %t8, %t9 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t16 = extractelement <4 x i32> %res12, i32 0 + %t17 = extractelement <4 x i32> %res12, i32 1 + %t18 = extractelement <4 x i32> %res12, i32 3 + %t19 = add i32 %t16, %t17 + %e12 = add i32 %t18, %t19 + %t20 = extractelement <4 x i32> %res13, i32 0 + %t21 = extractelement <4 x i32> %res13, i32 2 + %t22 = extractelement <4 x i32> %res13, i32 3 + %t23 = add i32 %t20, %t21 + %e13 = add i32 %t22, %t23 + %t24 = extractelement <4 x i32> %res14, i32 1 + %t25 = extractelement <4 x 
i32> %res14, i32 2 + %t26 = extractelement <4 x i32> %res14, i32 3 + %t27 = add i32 %t24, %t25 + %e14 = add i32 %t26, %t27 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 %s3, %e5 + %s5 = add i32 %s4, %e6 + %s6 = add i32 %s5, %e7 + %s7 = add i32 %s6, %e8 + %s8 = add i32 %s7, %e9 + %s9 = add i32 %s8, %e10 + %s10 = add i32 %s9, %e11 + %s11 = add i32 %s10, %e12 + %s12 = add i32 %s11, %e13 + %s13 = add i32 %s12, %e14 + %s14 = add i32 %s13, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll new file mode 100644 index 00000000000..ce9558cbf81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll @@ -0,0 +1,96 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s + +; CHECK-LABEL: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v2: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 +define void @v2(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v3: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +define void @v3(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v4: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 +define void @v4(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, 
float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v5: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +define void @v5(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v6: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 +define void @v6(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v7: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 +define void @v7(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.sample.ll new file mode 100644 index 00000000000..509c45f588b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.sample.ll @@ -0,0 +1,160 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 
= insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, + <32 x i8> undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> %res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + 
%t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +; CHECK: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + %5 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) + ret void +} + + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.sampled.ll new file mode 100644 index 00000000000..f2badff2a99 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.sampled.ll @@ -0,0 +1,143 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = 
insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, + <32 x i8> undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> %res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = 
fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + %t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll new file mode 100644 index 00000000000..2198590f2df --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll @@ -0,0 +1,20 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}main: +; BOTH: s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; BOTH-NEXT: s_sendmsg Gs_done(nop) +; BOTH-NEXT: s_endpgm + +define void @main(i32 inreg %a) #0 { +main_body: + call void @llvm.SI.sendmsg(i32 3, i32 %a) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #1 + +attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll new file mode 100644 index 00000000000..09675d50335 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll @@ -0,0 +1,24 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg Gs(emit stream 0) +; CHECK: s_sendmsg Gs(cut stream 1) +; CHECK: s_sendmsg Gs(emit-cut stream 2) +; CHECK: s_sendmsg Gs_done(nop) + +define void @main() { +main_body: + call void @llvm.SI.sendmsg(i32 34, i32 0); + call void @llvm.SI.sendmsg(i32 274, i32 0); + call void @llvm.SI.sendmsg(i32 562, i32 0); + call void @llvm.SI.sendmsg(i32 3, i32 0); + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll new file mode 
100644 index 00000000000..71f51548a5f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -0,0 +1,47 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}test1: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test1(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test2: +;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test2(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test3: +;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test3(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, + i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test4(i32 %vdata, i32 %vaddr) #0 { + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, + i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.tid.ll new file mode 100644 index 00000000000..f6e6d7050ba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.tid.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +;GCN: v_mbcnt_lo_u32_b32_e64 +;SI: v_mbcnt_hi_u32_b32_e32 +;VI: v_mbcnt_hi_u32_b32_e64 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call i32 @llvm.SI.tid() + %5 = bitcast i32 %4 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +declare i32 @llvm.SI.tid() readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll new file mode 100644 index 00000000000..036cd2ca82a --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone + +define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { + %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 + %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 + %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone + store float %dp4, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll new file mode 100644 index 00000000000..42df6db1ccf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kilp_gs_const: +; SI: s_mov_b64 exec, 0 +define void @kilp_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %3) + ret void +} + +declare void @llvm.AMDGPU.kilp(float) + +attributes #0 = { "ShaderType"="2" } + +!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll new file mode 100644 index 00000000000..4e4c2ec7791 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone + +; FUNC-LABEL: {{^}}test_lrp: +; SI: v_sub_f32 +; SI: v_mad_f32 +define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { + %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone + store float %mad, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll new file mode 100644 index 00000000000..c65df8b3e8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll @@ -0,0 +1,41 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +;FUNC-LABEL: test +;EG: MULADD_IEEE * +;EG: FRACT * +;EG: ADD * +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @test(float addrspace(1)* %out, float %x) #1 { + %cos = call float @llvm.cos.f32(float %x) + store float %cos, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: testv +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @testv(<4 x float> 
addrspace(1)* %out, <4 x float> inreg %vx) #1 { + %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) + store <4 x float> %cos, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.cos.f32(float) readnone +declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll new file mode 100644 index 00000000000..42698925aae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: EXP_IEEE +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.exp2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll new file mode 100644 index 00000000000..c75e7850b35 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: LOG_IEEE +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.log2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.log2.f32(float) readnone +declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll new file mode 100644 index 00000000000..e491732cf9c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -0,0 +1,365 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind + + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: 
{{^}}test_small_memcpy_i64_lds_to_lds_align2: +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind + ret void +} + +; FIXME: Use 64-bit ops +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; 
SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort + +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short + +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout 
= bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.pow.ll b/llvm/test/CodeGen/AMDGPU/llvm.pow.ll new file mode 100644 index 00000000000..c4ae652619c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.pow.ll @@ -0,0 +1,40 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-LABEL: test1: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, + +define void @test1(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.pow.f32( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: test2: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.pow.f32(float ,float ) readonly +declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll new file 
mode 100644 index 00000000000..c63fb172794 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f64: +; CI: v_rndne_f64_e32 + +; SI-DAG: v_add_f64 +; SI-DAG: v_add_f64 +; SI-DAG: v_cmp_gt_f64_e64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: s_endpgm +define void @rint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.rint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + + +declare double @llvm.rint.f64(double) #0 +declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll new file mode 100644 index 00000000000..661db51ad03 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @rint_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.rint.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f32: +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f32: +; R600: RNDNE +; R600: RNDNE +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDIL.round.nearest.f32(float) #0 +declare float @llvm.rint.f32(float) #0
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll new file mode 100644 index 00000000000..3d0f57e3328 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f64: +; SI: s_endpgm +define void @round_f64(double addrspace(1)* %out, double %x) #0 { + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out + ret void +} + +; This is a pretty large function, so just test a few of the +; instructions that are necessary. + +; FUNC-LABEL: {{^}}v_round_f64: +; SI: buffer_load_dwordx2 +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 + +; SI-DAG: v_not_b32_e32 +; SI-DAG: v_not_b32_e32 + +; SI-DAG: v_cmp_eq_i32 + +; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff +; SI-DAG: v_cmp_gt_i32_e64 +; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] + +; SI-DAG: v_cmp_gt_i32_e64 + + +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %x = load double, double addrspace(1)* %gep + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}round_v2f64: +; SI: s_endpgm +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { + %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f64: +; SI: s_endpgm +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { + %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f64: +; SI: s_endpgm +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { + %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +declare double @llvm.round.f64(double) #1 +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll new file mode 100644 index 00000000000..f5f124d915a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f32: +; SI-DAG: s_load_dword [[SX:s[0-9]+]] +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI: 
v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; SI: buffer_store_dword [[RESULT]] + +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] +; R600-DAG: ADD {{.*}}, +; R600-DAG: BFI_INT +; R600-DAG: SETGE +; R600-DAG: CNDE +; R600-DAG: ADD +define void @round_f32(float addrspace(1)* %out, float %x) #0 { + %result = call float @llvm.round.f32(float %x) #1 + store float %result, float addrspace(1)* %out + ret void +} + +; The vector tests are really difficult to verify, since it can be hard to +; predict how the scheduler will order the instructions. We already have +; a test for the scalar case, so the vector tests just check that the +; compiler doesn't crash. + +; FUNC-LABEL: {{^}}round_v2f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { + %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { + %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { + %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.round.f32(float) #1 +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll new file mode 100644 index 00000000000..3bb245c2e24 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sin_f32: +; EG: MULADD_IEEE * +; EG: FRACT * +; EG: ADD * +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_f32(float addrspace(1)* %out, float %x) #1 { + %sin = call float @llvm.sin.f32(float %x) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_3x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ef47644 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 3.0, %x + %sin
= call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_2x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2sin_f32: +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_v4f32: +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { + %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) + store <4 x float> %sin, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sin.f32(float) readnone +declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.ll new file mode 100644 index 00000000000..c6da047f539 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI + +; R600-LABEL: {{^}}sqrt_f32: +; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z +; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS +; SI-LABEL: {{^}}sqrt_f32: +; SI: v_sqrt_f32_e32 +define void @sqrt_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.sqrt.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v2f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS +; SI-LABEL: {{^}}sqrt_v2f32: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v4f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS +; SI-LABEL: {{^}}sqrt_v4f32: +; SI: v_sqrt_f32_e32 +; 
SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp olt float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_ult: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp ult float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> + %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2_ult: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> + %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %in) +declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/load-i1.ll b/llvm/test/CodeGen/AMDGPU/load-i1.ll new file mode 100644 index 00000000000..0ca49fde3e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-i1.ll @@ -0,0 +1,149 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}global_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT +define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}local_copy_i1_to_i1: +; SI: ds_read_u8 +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: ds_write_b8 +; SI: s_endpgm + +; EG: LDS_UBYTE_READ_RET +; EG: AND_INT +; EG: LDS_BYTE_WRITE +define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { + %load = load i1, i1 addrspace(3)* %in + store i1 %load, i1 addrspace(3)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT
+define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { + %load = load i1, i1 addrspace(2)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dword +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: BFE_INT +define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm + +define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 +; SI: buffer_store_byte +; SI: s_endpgm +define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { + store i1 %x, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i64: +; SI: buffer_load_ubyte +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: v_ashrrev_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/load-input-fold.ll b/llvm/test/CodeGen/AMDGPU/load-input-fold.ll new file mode 100644 index 00000000000..1daf0e6527b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-input-fold.ll @@ -0,0 +1,117 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1,
i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = load <4 x float>, <4 x float> addrspace(8)* null + %13 = extractelement <4 x float> %12, i32 0 + %14 = fmul float %0, %13 + %15 = load <4 x float>, <4 x float> addrspace(8)* null + %16 = extractelement <4 x float> %15, i32 1 + %17 = fmul float %0, %16 + %18 = load <4 x float>, <4 x float> addrspace(8)* null + %19 = extractelement <4 x float> %18, i32 2 + %20 = fmul float %0, %19 + %21 = load <4 x float>, <4 x float> addrspace(8)* null + %22 = extractelement <4 x float> %21, i32 3 + %23 = fmul float %0, %22 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %25 = extractelement <4 x float> %24, i32 0 + %26 = fmul float %1, %25 + %27 = fadd float %26, %14 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %29 = extractelement <4 x float> %28, i32 1 + %30 = fmul float %1, %29 + %31 = fadd float %30, %17 + %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %33 = extractelement <4 x float> %32, i32 2 + %34 = fmul float %1, %33 + %35 = fadd float %34, %20 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 3 + %38 = fmul float %1, %37 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %2, %41 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %2, %45 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %2, %49 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %2, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %3, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %3, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 3) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %3, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %3, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %73 = extractelement <4 x float> %72, i32 0 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %75 = extractelement <4 x float> %74, i32 1 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %77 = extractelement <4 x float> %76, i32 2 + %78 = insertelement <4 x float> undef, float %4, i32 0 + %79 = insertelement <4 x float> %78, float %5, i32 1 + %80 = insertelement <4 x float> %79, float %6, i32 2 + %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 + %82 = insertelement <4 x float> undef, float %73, i32 0 + %83 = insertelement <4 x float> %82, float %75, i32 1 + %84 = insertelement <4 x float> %83, float %77, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) + %87 = insertelement <4 x float> undef, float %86, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/load.ll b/llvm/test/CodeGen/AMDGPU/load.ll new file mode 100644 index 00000000000..93b1b51a0d0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load.ll @@ -0,0 +1,709 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +;===------------------------------------------------------------------------===; +; GLOBAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i8: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte +define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = load i8, i8 addrspace(1)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 + +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = sext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i16: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16 , i16 addrspace(1)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16, i16 addrspace(1)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the global address space. +; FUNC-LABEL: {{^}}load_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the global address space +; FUNC-LABEL: {{^}}load_v2f32: +; R600: MEM_RAT +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64: +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_sext: +; R600: MEM_RAT +; R600: MEM_RAT +; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; R600: 31 +; SI: buffer_load_dword + +define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = sext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_zext: +; R600: MEM_RAT +; R600: MEM_RAT +define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = zext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v8i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { +entry: + %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v16i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. 
+; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { +entry: + %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; CONSTANT ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load a sign-extended i8 value +; FUNC-LABEL: {{^}}load_const_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte v{{[0-9]+}}, +define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_aligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = zext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_unaligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 + %1 = load i8, i8 addrspace(2)* %0 + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load a sign-extended i16 value +; FUNC-LABEL: {{^}}load_const_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_aligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_unaligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 + %1 = load i16, i16 addrspace(2)* %0 + %2 = zext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load an i32 value from the constant address space. 
+; FUNC-LABEL: {{^}}load_const_addrspace_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +entry: + %0 = load i32, i32 addrspace(2)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Load a f32 value from the constant address space. +; FUNC-LABEL: {{^}}load_const_addrspace_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { + %1 = load float, float addrspace(2)* %in + store float %1, float addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; LOCAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the local address space. +; FUNC-LABEL: {{^}}load_i8_local: +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { + %1 = load i8, i8 addrspace(3)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext_local: +; R600: LDS_UBYTE_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { +entry: + %0 = load i8, i8 addrspace(3)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = sext 
<4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the local address space. +; FUNC-LABEL: {{^}}load_i16_local: +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16 , i16 addrspace(3)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_local: +; R600: LDS_USHORT_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16, i16 addrspace(3)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the local address space. +; FUNC-LABEL: {{^}}load_i32_local: +; R600: LDS_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the local address space. 
+; FUNC-LABEL: {{^}}load_f32_local: +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { +entry: + %0 = load float, float addrspace(3)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the local address space +; FUNC-LABEL: {{^}}load_v2f32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b64 +define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(3)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; Test loading a i32 and v2i32 value from the same base pointer. +; FUNC-LABEL: {{^}}load_i32_v2i32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI-DAG: ds_read_b32 +; SI-DAG: ds_read2_b32 +define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { + %scalar = load i32, i32 addrspace(3)* %in + %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* + %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 + %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 + %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0 + %vec = add <2 x i32> %vec0, %vec1 + store <2 x i32> %vec, <2 x i32> addrspace(1)* %out + ret void +} + + +@lds = addrspace(3) global [512 x i32] undef, align 4 + +; On SI we need to make sure that the base offset is a register and not +; an immediate. +; FUNC-LABEL: {{^}}load_i32_local_const_ptr: +; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 +; SI: ds_read_b32 v0, v[[ZERO]] offset:4 +; R600: LDS_READ_RET +define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 + %tmp1 = load i32, i32 addrspace(3)* %tmp0 + %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 %tmp1, i32 addrspace(1)* %tmp2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/load.vec.ll b/llvm/test/CodeGen/AMDGPU/load.vec.ll new file mode 100644 index 00000000000..02f883cd8e9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load.vec.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; load a v2i32 value from the global address space. +; EG: {{^}}load_v2i32: +; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v2i32: +; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + store <2 x i32> %a, <2 x i32> addrspace(1)* %out + ret void +} + +; load a v4i32 value from the global address space.
+; EG: {{^}}load_v4i32: +; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v4i32: +; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] +define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %a, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/load64.ll b/llvm/test/CodeGen/AMDGPU/load64.ll new file mode 100644 index 00000000000..74beabdc007 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load64.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; load a f64 value from the global address space. +; CHECK-LABEL: {{^}}load_f64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %1 = load double, double addrspace(1)* %in + store double %1, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}load_i64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tmp = load i64, i64 addrspace(1)* %in + store i64 %tmp, i64 addrspace(1)* %out, align 8 + ret void +} + +; Load a f64 value from the constant address space. +; CHECK-LABEL: {{^}}load_const_addrspace_f64: +; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { + %1 = load double, double addrspace(2)* %in + store double %1, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll new file mode 100644 index 00000000000..33f3159d13e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-64.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}local_i32_load +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %val = load i32, i32 addrspace(3)* %gep, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i32_load_0_offset +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %val = load i32, i32 addrspace(3)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: +; BOTH-NOT: ADD +; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; 
BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer: no in-bounds base +; address can have bit 16 set, so the OR is equivalent to an add. +; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] +; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 + %val = load i64, i64 addrspace(3)* %gep, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %val = load i64, i64 addrspace(3)* %in, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %gep = getelementptr double, double addrspace(3)* %in, i32 7 + %val = load double, double addrspace(3)* %gep, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %val = load double, double addrspace(3)* %in, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_i64_store(i64 addrspace(3)* %out) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 + store i64 5678, i64 addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store_0_offset: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { + store i64 1234, i64 addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_f64_store(double addrspace(3)* %out) nounwind { + %gep = getelementptr double, double addrspace(3)* %out, i32 7 + store double 16.0, double addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store_0_offset +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { + store double 20.0, double addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_v2i64_store: +; BOTH-NOT: ADD +; BOTH-DAG: ds_write_b64
+
+; BOTH-LABEL: {{^}}local_i64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
+  %val = load i64, i64 addrspace(3)* %gep, align 8
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_load_0_offset:
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+  %val = load i64, i64 addrspace(3)* %in, align 8
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset:56
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+  %gep = getelementptr double, double addrspace(3)* %in, i32 7
+  %val = load double, double addrspace(3)* %gep, align 8
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_load_0_offset:
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; BOTH: buffer_store_dwordx2 [[REG]],
+define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+  %val = load double, double addrspace(3)* %in, align 8
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
+  store i64 5678, i64 addrspace(3)* %gep, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+  store i64 1234, i64 addrspace(3)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
+define void @local_f64_store(double addrspace(3)* %out) nounwind {
+  %gep = getelementptr double, double addrspace(3)* %out, i32 7
+  store double 16.0, double addrspace(3)* %gep, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_f64_store_0_offset:
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+  store double 20.0, double addrspace(3)* %out, align 8
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: s_endpgm
+define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
+  store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: s_endpgm
+define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+  store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH: s_endpgm
+define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+  %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
+  store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+  ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH: s_endpgm
+define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+  store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics.ll b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
new file mode 100644
index 00000000000..2aaf977ab90
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics.ll
@@ -0,0 +1,551 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 
addrspace(1)* %out, align 4 + ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
+; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+;   store i32 %result, i32 addrspace(1)* %out, align 4
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = 
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: +; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_u32 [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: +; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: +; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: +; GCN: ds_sub_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; GCN: ds_and_b32
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; GCN: ds_or_b32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; GCN: ds_xor_b32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
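+; Until that expansion exists, here is a minimal sketch (illustrative only,
+; not checked by this test, and the function name is made up) of lowering the
+; nand with a cmpxchg retry loop; the disabled test follows it:
+;
+; define i32 @nand_expansion_sketch(i32 addrspace(3)* %ptr, i32 %val) {
+; entry:
+;   %init = load i32, i32 addrspace(3)* %ptr
+;   br label %loop
+; loop:
+;   ; Retry until the value observed at the load/previous iteration is still
+;   ; the one in memory when we try to publish ~(old & val).
+;   %old = phi i32 [ %init, %entry ], [ %loaded, %loop ]
+;   %and = and i32 %old, %val
+;   %nand = xor i32 %and, -1
+;   %pair = cmpxchg i32 addrspace(3)* %ptr, i32 %old, i32 %nand seq_cst seq_cst
+;   %loaded = extractvalue { i32, i1 } %pair, 0
+;   %success = extractvalue { i32, i1 } %pair, 1
+;   br i1 %success, label %exit, label %loop
+; exit:
+;   ; Like atomicrmw, the expansion yields the value that was in memory.
+;   ret i32 %old
+; }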
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst +; ret void +; } + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: +; GCN: ds_min_i32 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: +; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: +; GCN: ds_max_i32 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: +; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: +; GCN: ds_min_u32 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: +; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: +; GCN: ds_max_u32 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: +; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll new file mode 100644 index 00000000000..0ffa5e751b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll @@ -0,0 +1,470 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: +; GCN: ds_wrxchg_rtn_b64 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: +; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, 
i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: +; GCN: ds_add_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: +; GCN: ds_inc_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: +; GCN: ds_sub_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: +; GCN: ds_sub_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: +; GCN: ds_dec_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:
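+; (The cmpxchg-loop expansion sketched for the 32-bit case in
+; local-atomics.ll would apply here as well, with i64 operands.)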
+; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+;   store i64 %result, i64 addrspace(1)* %out, align 8
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; GCN: ds_min_rtn_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; GCN: ds_min_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; GCN: ds_max_rtn_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; GCN: ds_max_rtn_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; GCN: ds_min_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; GCN: ds_min_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; GCN: ds_max_rtn_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; GCN: ds_max_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; GCN: ds_wrxchg_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: +; GCN: ds_add_u64 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: +; GCN: ds_inc_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: +; GCN: ds_sub_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: +; GCN: ds_sub_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: +; GCN: ds_dec_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: +; GCN: ds_and_b64 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: +; GCN: ds_and_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64:
+; GCN: ds_or_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN: ds_xor_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN: ds_min_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN: ds_max_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN: ds_min_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN: ds_max_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* 
%ptr, i32 4
+  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
new file mode 100644
index 00000000000..06a8b1246e6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 38792
+
+; EG: {{^}}local_memory_two_objects:
+
+; We would like to check that the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets (see the layout note at the end of this file).
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+  %x.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+  %mul = shl nsw i32 %x.i, 1
+  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+  %sub = sub nsw i32 3, %x.i
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+  %add = add nsw i32 %x.i, 4
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
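+
+; Layout note (an assumption inferred from the checks above, not something
+; this test asserts directly): local_mem0 and local_mem1 are each
+; 4 x i32 = 16 bytes, so with the two objects allocated back to back one
+; starts 16 bytes after the other. That is why CI can address both reads
+; through the same base register, differing only in the constant offset:16,
+; while SI has to materialize the +16 with a separate v_add_i32.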
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
new file mode 100644
index 00000000000..9494ed75bd0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
+
+
+; Check that the LDS size is emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 128
+; SI: .long 47180
+; SI-NEXT: .long 71560
+; CI: .long 47180
+; CI-NEXT: .long 38792
+
+; FUNC-LABEL: {{^}}local_memory:
+
+; EG: LDS_WRITE
+; SI-NOT: s_wqm_b64
+; SI: ds_write_b32
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+; SI: s_barrier
+
+; EG: LDS_READ_RET
+; SI: ds_read_b32 {{v[0-9]+}},
+
+define void @local_memory(i32 addrspace(1)* %out) {
+entry:
+  %y.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %add = add nsw i32 %y.i, 1
+  %cmp = icmp eq i32 %add, 16
+  %.add = select i1 %cmp, i32 0, i32 %add
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/loop-address.ll b/llvm/test/CodeGen/AMDGPU/loop-address.ll
new file mode 100644
index 00000000000..f60d574497d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/loop-address.ll
@@ -0,0 +1,34 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ALU_PUSH
+;CHECK: LOOP_START_DX10 @11
+;CHECK: LOOP_BREAK @10
+;CHECK: POP @10
+
+define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+entry:
+  %cmp5 = icmp sgt i32 %iterations, 0
+  br i1 %cmp5, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
+  %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %i.07 = add nsw i32 %i.07.in, -1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
+  store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
+  %add = add nsw i32 %ai.06, 1
+  %exitcond = icmp eq i32 %add, %iterations
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+
+!opencl.kernels = !{!0, !1, !2, !3}
+
+!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge}
+!1 = !{null}
+!2 = !{null}
+!3 = !{null}
diff --git a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
new file mode 100644
index 00000000000..5fd9806813c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -0,0 +1,51 @@
+; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck 
--check-prefix=SI --check-prefix=FUNC %s + + +; Make sure loop-idiom doesn't create memcpy or memset. There are no library +; implementations of these for R600. + +; FUNC: @no_memcpy +; R600-NOT: {{^}}llvm.memcpy +; SI-NOT: {{^}}llvm.memcpy +define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%4, %for.body] + %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 + %2 = getelementptr i8, i8* %dest, i32 %0 + %3 = load i8, i8 addrspace(3)* %1 + store i8 %3, i8* %2 + %4 = add i32 %0, 1 + %5 = icmp eq i32 %4, %size + br i1 %5, label %for.end, label %for.body + +for.end: + ret void +} + +; FUNC: @no_memset +; R600-NOT: {{^}}llvm.memset +; R600-NOT: {{^}}memset_pattern16: +; SI-NOT: {{^}}llvm.memset +; SI-NOT: {{^}}memset_pattern16: +define void @no_memset(i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%2, %for.body] + %1 = getelementptr i8, i8* %dest, i32 %0 + store i8 0, i8* %1 + %2 = add i32 %0, 1 + %3 = icmp eq i32 %2, %size + br i1 %3, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lshl.ll b/llvm/test/CodeGen/AMDGPU/lshl.ll new file mode 100644 index 00000000000..9ac988d38d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lshl.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = mul i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.ll b/llvm/test/CodeGen/AMDGPU/lshr.ll new file mode 100644 index 00000000000..50e444ac26b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lshr.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = udiv i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/m0-spill.ll b/llvm/test/CodeGen/AMDGPU/m0-spill.ll new file mode 100644 index 00000000000..1dddc85f775 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/m0-spill.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +@lds = external addrspace(3) global [64 x float] + +; CHECK-LABEL: {{^}}main: +; CHECK-NOT: v_readlane_b32 m0 +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %cmp = fcmp ueq float 0.0, %4 + br i1 %cmp, label %if, label %else + +if: + %lds_ptr = getelementptr 
[64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %lds_data = load float, float addrspace(3)* %lds_ptr
+  br label %endif
+
+else:
+  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  br label %endif
+
+endif:
+  %export = phi float [%lds_data, %if], [%interp, %else]
+  %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+  %6 = bitcast i32 %5 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+  ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
new file mode 100644
index 00000000000..bc071628ead
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -0,0 +1,567 @@
+; Make sure we still form mad, rather than fma, even when unsafe math or fp-contract is allowed.
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+
+; Make sure we don't form mad with denormals
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF-NOT: v_fma
+; SI-DENORM-SLOWFMAF-NOT: v_mad
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+  %a = load float, float addrspace(1)* %gep.0
+  %b = load float, float addrspace(1)* %gep.1
+  %c = load float, float addrspace(1)* %gep.2
+
+  %mul = fmul float %a, %b
+  %fma = fadd float %mul, %c
+  store float %fma, float 
addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fadd float %mul, %c + %fma1 = fadd float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_mad_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, 
float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %c, %mul + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* 
%gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %mul, %c
+ %fma1 = fsub float %mul, %d
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fsub float %c, %mul
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ 
%b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %c, %mul
+ %fma1 = fsub float %d, %mul
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma = fsub float %mul.neg, %c
+
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ 
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul.neg, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float, float addrspace(1)* %gep.0
+ %b = load float, float addrspace(1)* %gep.1
+ %c = load float, float addrspace(1)* %gep.2
+ %d = load float, float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], 
[[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+; -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float 
addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
+ %tmp2 = fsub float %x, %tmp1
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+
+ %x = load float, float addrspace(1)* %gep.0
+ %y = load float, float addrspace(1)* %gep.1
+ %z = load float, float addrspace(1)* %gep.2
+ %u = load float, float addrspace(1)* %gep.3
+ %v = load float, float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fmuladd y, z, (fmul u, v)))
+; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD: v_mad_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/mad-sub.ll b/llvm/test/CodeGen/AMDGPU/mad-sub.ll new file mode 100644 index 00000000000..aa4194ff610 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mad-sub.ll @@ -0,0 +1,215 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}mad_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture 
readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %c, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_f64: +; SI: v_mul_f64 +; SI: v_add_f64 +define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext + %a = load double, double addrspace(1)* %gep0, align 8 + %b = load double, double addrspace(1)* %gep1, align 8 + %c = load double, double addrspace(1)* %gep2, align 8 + %mul = fmul double %a, %b + %sub = fsub double %mul, %c + store double %sub, double addrspace(1)* %outgep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c.abs + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr 
float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %c.abs, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}neg_neg_mad_f32: +; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %nega = fsub float -0.000000e+00, %a + %negb = fsub float -0.000000e+00, %b + %mul = fmul float %nega, %negb + %sub = fadd float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_fabs_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %b.abs = call float @llvm.fabs.f32(float %b) #0 + %mul = fmul float %a, %b.abs + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %r2, %add + + store float %r3, float 
addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %add, %r2 + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll new file mode 100644 index 00000000000..86d75a63ca4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}i32_mad24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. +; EG: MULLO_INT +; Make sure we aren't masking the inputs. 
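+; The shl/ashr pairs in the IR below already sign-extend from 24 bits in
+; place, so a successful 24-bit mad match should consume them directly; a
+; stray AND of the operands would mean the inputs were re-masked rather than
+; fed straight into MULADD_INT24 / v_mad_i32_i24, which is what the -NOT
+; lines below rule out.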
+; CM-NOT: AND +; CM: MULADD_INT24 +; SI-NOT: and +; SI: v_mad_i32_i24 +define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_imul24 +; SI: v_mad_i32_i24 +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + %add = add i32 %mul, %src2 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll new file mode 100644 index 00000000000..95fe3411959 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mad24: +; EG: MULADD_UINT24 +; SI: v_mad_u32_u24 + +define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mad24: +; The order of A and B does not matter. +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 + +define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +entry: + %0 = mul i16 %a, %b + %1 = add i16 %0, %c + %2 = sext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mad24: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +entry: + %0 = mul i8 %a, %b + %1 = add i8 %0, %c + %2 = sext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; This tests for a bug where the mad_u24 pattern matcher would call +; SimplifyDemandedBits on the first operand of the mul instruction +; assuming that the pattern would be matched to a 24-bit mad. This +; led to some instructions being incorrectly erased when the entire +; 24-bit mad pattern wasn't being matched. + +; Check that the select instruction is not deleted. 
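+; Concretely: the %2 = select below feeds %3 = mul. If SimplifyDemandedBits
+; erased the select while the full 24-bit mad pattern then failed to match,
+; the mul would be left reading a deleted value, so the CNDE_INT / v_cndmask
+; checks insist the select survives.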
+; FUNC-LABEL: {{^}}i24_i32_i32_mad: +; EG: CNDE_INT +; SI: v_cndmask +define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %0 = ashr i32 %a, 8 + %1 = icmp ne i32 %c, 0 + %2 = select i1 %1, i32 %0, i32 34 + %3 = mul i32 %2, %c + %4 = add i32 %3, %d + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll new file mode 100644 index 00000000000..933bb016d2c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -0,0 +1,193 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Enable VI + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madak_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 +define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure this is only folded with one use. This is a code size +; optimization and if we fold the immediate multiple times, we'll undo +; it. 
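+; In other words: with a single use the 0x41200000 literal can ride along in
+; v_madak_f32, but for the two uses below the expected form is a single
+; v_mov_b32_e32 of the constant followed by two ordinary v_mad_f32, which is
+; what the GCN-DAG lines check.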
+ +; GCN-LABEL: {{^}}madak_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] +; GCN: s_endpgm +define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, %b + %mul1 = fmul float %a, %c + %madak0 = fadd float %mul0, 10.0 + %madak1 = fadd float %mul1, 10.0 + + store float %madak0, float addrspace(1)* %out.gep.0, align 4 + store float %madak1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; GCN-LABEL: {{^}}madak_m_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float 4.0, %a + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure nothing weird happens with a value that is also allowed as +; an inline immediate. 
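+; 4.0 is already encodable as an inline immediate, so folding it as a
+; v_madak_f32 literal would gain nothing; the check below expects a plain
+; v_mad_f32 with 4.0 as the final operand instead.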
+ +; GCN-LABEL: {{^}}madak_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 4.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We can't use an SGPR when forming madak +; GCN-LABEL: {{^}}s_v_madak_f32: +; GCN: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] +define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: @v_s_madak_f32 +; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] +define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madak_f32: +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 
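+ ; The fabs below becomes a source modifier, and the VOP2 encoding used by
+ ; v_madak_f32 has no modifier bits (hence, presumably, the plain v_mad_f32
+ ; expected by the checks above).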
+ + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, %b.fabs + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll new file mode 100644 index 00000000000..ba7bb221a99 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -0,0 +1,205 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madmk_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] +; GCN: s_endpgm +define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = 
getelementptr float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, 10.0 + %mul1 = fmul float %a, 10.0 + %madmk0 = fadd float %mul0, %b + %madmk1 = fadd float %mul1, %c + + store float %madmk0, float addrspace(1)* %out.gep.0, align 4 + store float %madmk1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; We don't get any benefit if the constant is an inline immediate. +; GCN-LABEL: {{^}}madmk_inline_imm_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] +define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 4.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}scalar_vector_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %b = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| +define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b.fabs + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 +define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, 2.0 + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: {{^}}kill_madmk_verifier_error: +; SI: s_xor_b64 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c +; SI: s_or_b64 +define void @kill_madmk_verifier_error() nounwind { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb6, %bb + %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] + %tmp3 = fsub float undef, %tmp + %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 + br i1 %tmp5, label %bb1, label %bb6 + +bb6: ; preds = %bb2 + %tmp4 = fmul float %tmp, undef + %tmp7 = fmul float %tmp4, 0x40E55DD180000000 + %tmp8 = fadd float %tmp7, undef + br label %bb2 +} diff --git a/llvm/test/CodeGen/AMDGPU/max-literals.ll b/llvm/test/CodeGen/AMDGPU/max-literals.ll new file mode 100644 index 00000000000..c357524b140 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/max-literals.ll @@ -0,0 +1,67 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: ADD * + +define void 
@main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 5.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; CHECK-LABEL: {{^}}main2: +; CHECK-NOT: ADD * + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 2.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll new file mode 100644 index 00000000000..fef3e2f0a21 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -0,0 +1,168 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax_sge_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sge_i32 +; SI: s_max_i32 +define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 
%val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sge i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sgt i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imax_sgt_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sgt_i32 +; SI: s_max_i32 +define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_uge_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_uge_i32 +; SI: s_max_u32 +define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_ugt_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_ugt_i32 +; SI: s_max_u32 +define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: 
{{^}}simplify_demanded_bits_test_umax_ugt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ugt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp sgt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. + +; FUNC-LABEL: {{^}}s_test_imin_sge_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_ge_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sge i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll new file mode 100644 index 00000000000..cfb94b272e5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/max3.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax3_sgt_i32 +; SI: v_max3_i32 +define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp sgt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp sgt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax3_ugt_i32 +; SI: v_max3_u32 +define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + 
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ugt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ugt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll new file mode 100644 index 00000000000..dbf9d4481ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -0,0 +1,536 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s + +; Run with devices with different unaligned load restrictions. + +; TODO: Vector element tests +; TODO: Non-zero base offset for load and store combinations +; TODO: Same base addrspacecasted + + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 0, i16 addrspace(1)* %out.gep.1 + store i16 0, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: +; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: +; GCN: buffer_store_dwordx2 +define void 
@merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* + store float 1.0, float addrspace(1)* %out.gep.1.bc + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + store i32 123, i32 addrspace(1)* %out.gep.1.bc + store float 4.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: +; GCN: buffer_store_dwordx4 +define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 333, i32 addrspace(1)* %out.gep.3 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx2 v +define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 8.0, float addrspace(1)* %out + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + ret void +} + +; First store is out of order. Because of order of combines, the +; consecutive store fails because only some of the stores have been +; replaced with integer constant stores, and then won't merge because +; the types are different. 
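+;
+; A hedged illustration of that intermediate state (assumed IR, not part
+; of the checks): after the combiner rewrites some of the float constant
+; stores as integer constant stores, the run mixes both types, e.g.
+;   store i32 1065353216, i32 addrspace(1)* %p    ; 0x3f800000 == 1.0f
+;   store float 8.000000e+00, float addrspace(1)* %q
+; (%p and %q are hypothetical pointers), and the type mismatch blocks the
+; merge into a single wide store.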
+ +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + store float 8.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword +; SI-NOT: buffer_store_dword +; GCN: s_endpgm +define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: +; XGCN: buffer_store_dwordx4 +; XGCN: buffer_store_dwordx4 + +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 + %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out.gep.2 + store i64 333, i64 addrspace(1)* %out.gep.3 + store i64 1234, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[LOAD]] +define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %lo = load i32, i32 addrspace(1)* %in.gep.0 + %hi = 
load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out.gep.0 + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %hi, i32 addrspace(1)* %out + store i32 %lo, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: +; SI-DAG: buffer_load_dwordx2 +; SI-DAG: buffer_load_dword v +; GCN: s_waitcnt +; SI-DAG: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 v +; GCN: s_endpgm +define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 + + %x = load float, float addrspace(1)* %in + %y = load float, float addrspace(1)* %in.gep.1 + %z = load float, float addrspace(1)* %in.gep.2 + %w = load float, float addrspace(1)* %in.gep.3 + + store float %x, float addrspace(1)* %out + store float %y, float 
addrspace(1)* %out.gep.1 + store float %z, float addrspace(1)* %out.gep.2 + store float %w, float addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 + + %x = load i32, i32 addrspace(1)* %in.gep.0 + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out.gep.0 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: s_barrier +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out.gep.3 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %x, i32 addrspace(1)* %out + + ret void +} + +; TODO: Re-packing of loaded register required. Maybe an IR pass +; should catch this? 
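+;
+; A sketch of what such an IR pass might emit (hypothetical form, not the
+; current codegen): keep the accesses vector-typed and express the
+; reordering as a shuffle, e.g.
+;   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in.vec
+;   %rev = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;   store <4 x i32> %rev, <4 x i32> addrspace(1)* %out.vec
+; where %in.vec and %out.vec are assumed vector-typed views of %in/%out.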
+ +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: s_barrier +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out + store i32 %z, i32 addrspace(1)* %out.gep.1 + store i32 %y, i32 addrspace(1)* %out.gep.2 + store i32 %x, i32 addrspace(1)* %out.gep.3 + + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: buffer_store_dword [[LOAD]] +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in, align 4 + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out, align 4 + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; This works once AA is enabled on the subtarget +; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: +; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] +; XGCN: buffer_store_dwordx4 [[LOAD]] +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in + + %x = extractelement <4 x i32> %vec, i32 0 + %y = extractelement <4 x i32> %vec, i32 1 + %z = extractelement <4 x i32> %vec, i32 2 + %w = extractelement <4 x i32> %vec, i32 3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: +; GCN: ds_write_b8 +; GCN: ds_write_b8 +; GCN: s_endpgm +define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 + + store i8 123, i8 addrspace(3)* %out.gep.1 + store i8 456, i8 addrspace(3)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: +; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} +define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out.gep.2 + store i32 333, i32 addrspace(3)* %out.gep.3 + store i32 1234, i32 addrspace(3)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 + +attributes #0 = { nounwind } +attributes #1 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll new file mode 100644 index 00000000000..0332d1a8e40 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -0,0 +1,189 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin_sle_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 
+ ret void +} + +; FUNC-LABEL: @s_test_imin_sle_i32 +; SI: s_min_i32 +define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imin_slt_i32 +; SI: s_min_i32 +define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp slt i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sle i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ule_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ule_i32 +; SI: s_min_u32 +define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ult_i32 +; SI: s_min_u32 +define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, 
i32 %a, i32 %b) nounwind { + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32_multi_use +; SI-NOT: v_min +; SI: v_cmp_lt_u32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid + %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep0, align 4 + store i1 %cmp, i1 addrspace(1)* %outgep1 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ult i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp slt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. 
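+;
+; The missed fold, stated as an identity (reasoning only, not a check):
+; sign extension is monotonic, so
+;   min(sext %a, sext %b) == sext(min(%a, %b))
+; meaning the compare/select sequence below could collapse to a single
+; s_min_i32 on the already-extended inputs.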
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_le_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll new file mode 100644 index 00000000000..38ef46d1bdd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/min3.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin3_slt_i32 +; SI: v_min3_i32 +define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp slt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_ult_i32 +; SI: v_min3_u32 +define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ult i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ult i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_umin_umin +; SI: v_min_i32 +; SI: v_min3_i32 +define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b 
+ + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %i1 + %i2 = select i1 %icmp2, i32 %i0, i32 %i1 + + store i32 %i2, i32 addrspace(1)* %outgep1, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_2_uses +; SI-NOT: v_min3 +define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %c + %i2 = select i1 %icmp2, i32 %i0, i32 %c + + store i32 %i2, i32 addrspace(1)* %outgep0, align 4 + store i32 %i0, i32 addrspace(1)* %outgep1, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll new file mode 100644 index 00000000000..4af9cdf1b96 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 + +; Make sure when the load from %ptr2 is folded the chain isn't lost, +; resulting in losing the store to gptr + +; FUNC-LABEL: {{^}}missing_store_reduced: +; SI: ds_read_b64 +; SI: buffer_store_dword +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + store i32 %tmp2, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } + diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll new file mode 100644 index 00000000000..b19163f294e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -0,0 +1,183 @@ +; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +;;;==========================================================================;;; +;;; MUBUF LOAD TESTS +;;;==========================================================================;;; + +; MUBUF load with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_load0: +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 +define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr 
i32, i32 addrspace(1)* %in, i64 1 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_load1: +; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 +define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %1 = load i8, i8 addrspace(1)* %0 + store i8 %1, i8 addrspace(1)* %out + ret void +} + +; MUBUF load with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_load2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 +define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_load3: +; CHECK-NOT: ADD +; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 +define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + %2 = load i32, i32 addrspace(1)* %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}soffset_max_imm: +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc +define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +; Make sure immediates that aren't inline constants don't get folded into +; the soffset operand. +; FIXME: for this test we should be smart enough to shift the immediate into +; the offset field. 
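+;
+; A hedged sketch of the desired selection (assumed output, not a check):
+; 65 (0x41) is outside the inline constant range of -16..64, but it does
+; fit in the 12-bit immediate offset field, so this could instead select
+;   buffer_load_dword v0, v1, s[0:3], 0 offen offset:65 glc
+; (registers illustrative) rather than materializing 0x41 into an SGPR.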
+; CHECK-LABEL: {{^}}soffset_no_fold: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 +; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc +define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +;;;==========================================================================;;; +;;; MUBUF STORE TESTS +;;;==========================================================================;;; + +; MUBUF store with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_store0: +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 +define void @mubuf_store0(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_store1: +; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 + +define void @mubuf_store1(i8 addrspace(1)* %out) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 + store i8 0, i8 addrspace(1)* %0 + ret void +} + +; MUBUF store with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_store2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 +define void @mubuf_store2(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_store3: +; CHECK-NOT: ADD +; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 +define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + store i32 0, i32 addrspace(1)* %1 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 +define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { + store i32 99, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 +define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] +define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] +define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst + ret void +} + +; CHECK-LABEL: {{^}}store_vgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll new file mode 100644 index 00000000000..94e0f96b323 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -0,0 +1,200 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; mul24 and mad24 are affected + +; FUNC-LABEL: {{^}}test_mul_v2i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = mul <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul_v4i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = mul <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: s_mul_i32 +; SI: buffer_store_dword +define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: v_mul_lo_i32 +; SI: buffer_store_dword +define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top +; 32-bits of both arguments are sign bits. +; FUNC-LABEL: {{^}}mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_i32 +define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = mul i64 %0, 80 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_i32 +; SI: s_endpgm +define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: +; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI: s_endpgm +define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 9 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_mul_i32: +; SI: s_load_dword [[SRC0:s[0-9]+]], +; SI: s_load_dword [[SRC1:s[0-9]+]], +; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %mul = mul i32 %a, %b + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i32: +; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = mul i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; A standard 64-bit multiply. 
The expansion should be around 6 instructions. +; It would be difficult to match the expansion correctly without writing +; a really complicated list of FileCheck expressions. I don't want +; to confuse people who may 'break' this test with a correct optimization, +; so this test just uses FUNC-LABEL to make sure the compiler does not +; crash with a 'failed to select' error. + +; FUNC-LABEL: {{^}}s_mul_i64: +define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i64: +; SI: v_mul_lo_i32 +define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mul32_in_branch: +; SI: s_mul_i32 +define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i32, i32 addrspace(1)* %in + br label %endif + +else: + %2 = mul i32 %a, %b + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul64_in_branch: +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_u32 +; SI: s_endpgm +define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = mul i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll new file mode 100644 index 00000000000..7609dcc87af --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i32_mul24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
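+;
+; The shl/ashr pair below is the usual sign-extend-from-24-bit idiom; as
+; a worked example, %a = 0x00ffffff becomes 0xffffff00 after the shl by 8
+; and 0xffffffff (i.e. -1) after the arithmetic shift right by 8.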
+; EG: MULLO_INT +; Make sure we are not masking the inputs +; CM-NOT: AND +; CM: MUL_INT24 +; SI-NOT: and +; SI: v_mul_i32_i24 +define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24.ll new file mode 100644 index 00000000000..e640a7cd69f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_mul_u32_u24 + +define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 +define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %0 = mul i16 %a, %b + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { +entry: + %0 = mul i8 %a, %b + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Multiply with 24-bit inputs and 64-bit output +; FUNC-LABEL: {{^}}mul24_i64: +; EG: MUL_UINT24 +; EG: MULHI +; SI: v_mul_u32_u24 +; FIXME: SI supports 24-bit mulhi +; SI: v_mul_hi_u32 +define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = shl i64 %a, 40 + %a_24 = lshr i64 %0, 40 + %1 = shl i64 %b, 40 + %b_24 = lshr i64 %1, 40 + %2 = mul i64 %a_24, %b_24 + store i64 %2, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mulhu.ll b/llvm/test/CodeGen/AMDGPU/mulhu.ll new file mode 100644 index 00000000000..29b0944a553 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mulhu.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab +;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 + +define void @test(i32 %p) { + %i = udiv i32 %p, 3 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare
<4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll new file mode 100644 index 00000000000..9a814b579de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s +; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s +; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s + +@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 + +; FUNC-LABEL: {{^}}load_extern_const_init: +define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 + +; FUNC-LABEL: {{^}}load_undef_const_init: +define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll new file mode 100644 index 00000000000..e4328ecbaca --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -0,0 +1,191 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; Make sure we don't turn the 32-bit argument load into a 16-bit +; load. There aren't extending scalar loads, so that would require +; using a buffer_load instruction. + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: +; SI: s_load_dword s +; SI: buffer_store_short v +define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i16 + store i16 %trunc, i16 addrspace(1)* %out + ret void +} + +; It should be OK (and probably performance neutral) to reduce this, +; but we don't know if the load is uniform yet.
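+; As a rough illustration (assumed shape, not checked output; the exact
+; registers and kernarg offset are hypothetical): the kernarg case above
+; keeps the full-width scalar read and only narrows at the store,
+;
+;   s_load_dword s2, s[0:1], 0xb    ; full 32-bit scalar load of the argument
+;   buffer_store_short v0, ...      ; the truncation happens at the store
+;
+; rather than replacing the s_load_dword with a narrowing buffer_load.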
+ +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: +; SI: buffer_load_dword v +; SI: buffer_store_short v +define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i16 + store i16 %trunc, i16 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i1 + store i1 %trunc, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i1 + store i1 %trunc, i1 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; Might as well reduce to 8-bit loads. +; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { + %trunc = trunc i16 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: +; SI: buffer_load_ubyte v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i16, i16 addrspace(1)* %gep.in + %trunc = trunc i16 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll new file mode 100644 index 00000000000..816755efb07 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -0,0 +1,113 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}fold_sgpr: +; CHECK: v_add_i32_e32 v{{[0-9]+}}, s +define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +entry: + %tmp0 = icmp ne i32 %fold, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %offset = add i32 %fold, %id + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + store i32 0, i32 addrspace(1)* %tmp1 + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: 
{{^}}fold_imm: +; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 +define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +entry: + %fold = add i32 3, 2 + %tmp0 = icmp ne i32 %cmp, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %val = or i32 %id, %fold + store i32 %val, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: {{^}}fold_64bit_constant_add: +; CHECK-NOT: s_mov_b64 +; FIXME: It would be better if we could use v_add here and drop the extra +; v_mov_b32 instructions. +; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1 +; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 +; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] +; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, + +define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +entry: + %tmp0 = add i64 %val, 1 + store i64 %tmp0, i64 addrspace(1)* %out + ret void +} + +; Inline constants should always be folded. + +; CHECK-LABEL: {{^}}vector_inline: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} + +define void @vector_inline(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +; Immediates with one use should be folded +; CHECK-LABEL: {{^}}imm_one_use: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} + +define void @imm_one_use(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = xor i32 %tmp0, 100 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} +; CHECK-LABEL: {{^}}vector_imm: +; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} + +define void @vector_imm(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll new file mode 100644 index 00000000000..20420a84de6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +;
Make sure there isn't an extra space between the instruction name and the first operand. + +; GCN-LABEL: {{^}}add_f32: +; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] +; GCN: buffer_store_dword [[RESULT]], +define void @add_f32(float addrspace(1)* %out, float %a, float %b) { + %result = fadd float %a, %b + store float %result, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll new file mode 100644 index 00000000000..1c04090b407 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -0,0 +1,178 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}or_v2i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = or <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}or_v4i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*?
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = or <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i32: +; SI: s_or_b32 +define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %or = or i32 %a, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i32: +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { + %loada = load i32, i32 addrspace(1)* %a + %or = or i32 %loada, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_literal_i32: +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f +define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { + %or = or i32 %a, 99999 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_literal_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 65535 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} +define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 4 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i64: +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z + +; SI: s_or_b64 +define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %a, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %loadb = load i64, i64 addrspace(1)* %b, align 8 + %or = or i64 %loada, %loadb + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { + %loada = load i64, i64 addrspace(1)* %a + %or = or i64 %loada, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64_loadimm: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b)
{ + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 22470723082367 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FIXME: The 'or 0' should really be removed. +; FUNC-LABEL: {{^}}vector_or_i64_imm: +; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] +; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} +; SI: s_endpgm +define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %b, %a + %trunc = trunc i64 %or to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}or_i1: +; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1)* %in0 + %b = load float, float addrspace(1)* %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = zext i1 %or to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_or_i1: +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { + %cmp0 = icmp eq i32 %a, %b + %cmp1 = icmp eq i32 %c, %d + %or = or i1 %cmp0, %cmp1 + store i1 %or, i1 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll new file mode 100644 index 00000000000..49a7c0df748 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK: {{^}}test: +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z +; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W + +define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +entry: + %shl = sub i32 32, %e + %x = add i32 %x_arg, 1 + %x.0 = shl i32 %x, %shl + %x.1 = lshr i32 %x, %e + %x.2 = or i32 %x.0, %x.1 + %y = add i32 %y_arg, 1 + %y.0 = shl i32 %y, %shl + %y.1 = lshr i32 %y, %e + %y.2 = or i32 %y.0, %y.1 + %z = add i32 %z_arg, 1 + %z.0 = shl i32 %z, %shl + %z.1 = lshr i32 %z, %e + %z.2 = or i32 %z.0, %z.1 + %w = add i32 %w_arg, 1 + %w.0 = shl i32 %w, %shl + %w.1 = lshr i32 %w, %e + %w.2 = or i32 %w.0, %w.1 + %xy = or i32 %x.2, %y.2 + %zw = or i32 %z.2, %w.2 + %xyzw = or i32 %xy, %zw + store i32 %xyzw, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll new file mode 100644 index 00000000000..f32b044198a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -0,0 +1,59 @@ +; Function Attrs: nounwind +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s +; +; CFG flattening should
use parallel-and mode to generate branch conditions and +; then merge if-regions with the same bodies. +; +; CHECK: AND_INT +; CHECK-NEXT: AND_INT +; CHECK-NEXT: OR_INT + +; FIXME: For some reason having the allocas here allowed the flatten-cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening. + +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + store i32 1, i32* %data, align 4 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %entry + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.end6 + +land.lhs.true3: ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.end6 + +if.then5: ; preds = %land.lhs.true3 + store i32 1, i32* %data, align 4 + br label %if.end6 + +if.end6: ; preds = %if.then5, %land.lhs.true3, %if.end + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll new file mode 100644 index 00000000000..1da1e91b8ab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -0,0 +1,66 @@ +; Function Attrs: nounwind +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; +; CFG flattening should use parallel-or to generate branch conditions and +; then merge if-regions with the same bodies. + +; FIXME: For some reason having the allocas here allowed the flatten-cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening.
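+;
+; For reference, a minimal sketch (illustrative, not the pass's literal
+; output; the %not.* and %or.cond names are hypothetical) of the flattened
+; form expected here: the store in if.else runs when either compare fails,
+; so the two guarding branches collapse into one branch on the OR of the
+; inverted conditions,
+;
+;   %not.cmp = xor i1 %cmp, true
+;   %not.cmp1 = xor i1 %cmp1, true
+;   %or.cond = or i1 %not.cmp, %not.cmp1
+;   br i1 %or.cond, label %if.else, label %if.then
+;
+; which is what the OR_INT checks below correspond to.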
+; XFAIL: * +; +; CHECK: OR_INT +; CHECK-NEXT: OR_INT +; CHECK-NEXT: OR_INT +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.else + +land.lhs.true: ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %land.lhs.true + br label %if.end + +if.else: ; preds = %land.lhs.true, %entry + store i32 1, i32* %data, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.else6 + +land.lhs.true3: ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.else6 + +if.then5: ; preds = %land.lhs.true3 + br label %if.end7 + +if.else6: ; preds = %land.lhs.true3, %if.end + store i32 1, i32* %data, align 4 + br label %if.end7 + +if.end7: ; preds = %if.else6, %if.then5 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/predicate-dp4.ll b/llvm/test/CodeGen/AMDGPU/predicate-dp4.ll new file mode 100644 index 00000000000..6bc18759435 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/predicate-dp4.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: PRED_SETE_INT * Pred, +; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one +define void @main(<4 x float> inreg) #0 { +main_body: + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %IF, label %ENDIF + +IF: ; preds = %main_body + %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) + br label %ENDIF + +ENDIF: ; preds = %IF, %main_body + %5 = phi float [%4, %IF], [0.000000e+00, %main_body] + %6 = insertelement <4 x float> undef, float %5, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll new file mode 100644 index 00000000000..0ce74d97ba8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/predicates.ll @@ -0,0 +1,104 @@ +; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s + +; These tests make sure the compiler is optimizing branches using predicates +; when it is legal to do so.
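+; As a rough illustration (assumed, not literal compiler output): for a
+; small guarded update such as the simple_if test below,
+;
+;   %cond = icmp sgt i32 %in, 0
+;   br i1 %cond, label %IF, label %ENDIF
+;
+; the branch can be dropped entirely by writing the predicate register
+; (e.g. with PRED_SETGT_INT) and executing the guarded LSHL under
+; Pred_sel, which is the pattern the checks below look for.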
+ +; CHECK: {{^}}simple_if: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ENDIF: + %2 = phi i32 [ %in, %entry ], [ %1, %IF ] + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}simple_if_else: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ELSE + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ELSE: + %2 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ENDIF + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ENDIF: + %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if_else: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ELSE1 + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ELSE1: + %4 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll new file mode 100644 index 00000000000..a008ac98a43 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s + +; This works because the promote-alloca pass replaces these with LDS atomics. + +; Private atomics have no real use, but the compiler at least shouldn't crash on them.
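+; For reference, a hypothetical sketch (not emitted by this test) of the
+; LDS form the pass is expected to produce for the atomicrmw below, with
+; @lds standing in for the buffer the pass creates and the per-workitem
+; indexing omitted:
+;
+;   @lds = internal addrspace(3) global [2 x i32] undef, align 4
+;   %ptr = getelementptr [2 x i32], [2 x i32] addrspace(3)* @lds, i32 0, i32 %in
+;   %old = atomicrmw add i32 addrspace(3)* %ptr, i32 7 acq_rel
+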
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic + %val = extractvalue { i32, i1 } %tmp4, 0 + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll new file mode 100644 index 00000000000..6b18a19f195 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll @@ -0,0 +1,21 @@ +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s + +; Make sure the promote-alloca pass doesn't crash + +; CHECK: unsupported call + +declare i32 @foo(i32*) nounwind + +define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %val = call i32 @foo(i32* %tmp3) nounwind + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/private-memory.ll b/llvm/test/CodeGen/AMDGPU/private-memory.ll new file mode 100644 index 00000000000..1c562978050 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-memory.ll @@ -0,0 +1,313 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}mova_same_clause: + +; R600: LDS_WRITE +; R600: LDS_WRITE +; R600: LDS_READ +; R600: LDS_READ + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 + +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}],
s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +entry: + %stack = alloca [5 x i32], align 4 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 + store i32 4, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 + store i32 5, i32* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 + %2 = load i32, i32* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 + %3 = load i32, i32* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +; This test checks that the stack offset is calculated correctly for structs. +; All register loads/stores should be optimized away, so there shouldn't be +; any MOVA instructions. +; +; XXX: This generated code has unnecessary MOVs; we should be able to optimize +; this. + +; FUNC-LABEL: {{^}}multiple_structs: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel +; SI-NOT: v_movrel +%struct.point = type { i32, i32 } + +define void @multiple_structs(i32 addrspace(1)* %out) { +entry: + %a = alloca %struct.point + %b = alloca %struct.point + %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 + %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 + store i32 0, i32* %a.x.ptr + store i32 1, i32* %a.y.ptr + store i32 2, i32* %b.x.ptr + store i32 3, i32* %b.y.ptr + %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %a.indirect = load i32, i32* %a.indirect.ptr + %b.indirect = load i32, i32* %b.indirect.ptr + %0 = add i32 %a.indirect, %b.indirect + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test direct access of a private array inside a loop. The private array +; loads and stores should be lowered to copies, so there shouldn't be any +; MOVA instructions.
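+; (Illustrative example of what "lowered to copies" means, assuming only
+; constant indices are used; %arr, %p, %a and %v are hypothetical names:
+;
+;   %arr = alloca [2 x i32]
+;   %p = getelementptr [2 x i32], [2 x i32]* %arr, i32 0, i32 0
+;   store i32 %a, i32* %p
+;   %v = load i32, i32* %p
+;
+; can be rewritten so %v is simply %a, leaving no indexed register access
+; for MOVA to perform.)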
+ +; FUNC-LABEL: {{^}}direct_loop: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel + +define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %prv_array_const = alloca [2 x i32] + %prv_array = alloca [2 x i32] + %a = load i32, i32 addrspace(1)* %in + %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %b = load i32, i32 addrspace(1)* %b_src_ptr + %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + store i32 %a, i32* %a_dst_ptr + %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 + store i32 %b, i32* %b_dst_ptr + br label %for.body + +for.body: + %inc = phi i32 [0, %entry], [%count, %for.body] + %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + %x = load i32, i32* %x_ptr + %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %y = load i32, i32* %y_ptr + %xy = add i32 %x, %y + store i32 %xy, i32* %y_ptr + %count = add i32 %inc, 1 + %done = icmp eq i32 %count, 4095 + br i1 %done, label %for.end, label %for.body + +for.end: + %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %value = load i32, i32* %value_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}short_array: + +; R600: MOVA_INT + +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 +; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define void @short_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i16] + %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 + store i16 0, i16* %1 + store i16 1, i16* %2 + %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index + %4 = load i16, i16* %3 + %5 = sext i16 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}char_array: + +; R600: MOVA_INT + +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 +define void @char_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i8] + %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 + store i8 0, i8* %1 + store i8 1, i8* %2 + %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index + %4 = load i8, i8* %3 + %5 = sext i8 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void + +} + +; Make sure we don't overwrite workitem information with private memory + +; FUNC-LABEL: {{^}}work_item_info: +; R600-NOT: MOV T0.X +; Additional check in case the move ends up in the last slot +; R600-NOT: MOV * T0.X + +; SI-NOT: v_mov_b32_e{{(32|64)}} v0 +define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [2 x i32] + %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 + store i32 0, i32* %1 + store i32 1, i32* %2 + %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in + %4 = load i32, i32* %3 + %5 = call i32 @llvm.r600.read.tidig.x() + %6 = add i32 %4, %5 + store i32 %6, i32 addrspace(1)*
%out + ret void +} + +; Test that two stack objects are not stored in the same register +; The second stack object should be in T3.X +; FUNC-LABEL: {{^}}no_overlap: +; R600: MOV +; R600: [[CHAN:[XYZW]]]+ +; R600-NOT: [[CHAN]]+ +; SI: v_mov_b32_e32 v3 +define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [3 x i8], align 1 + %1 = alloca [2 x i8], align 1 + %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 + %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 + %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 + %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 + %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 + store i8 0, i8* %2 + store i8 1, i8* %3 + store i8 2, i8* %4 + store i8 1, i8* %5 + store i8 0, i8* %6 + %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in + %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in + %9 = load i8, i8* %7 + %10 = load i8, i8* %8 + %11 = add i8 %9, %10 + %12 = sext i8 %11 to i32 + store i32 %12, i32 addrspace(1)* %out + ret void +} + +define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i8]] + %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 + store i8 0, i8* %gep0 + store i8 1, i8* %gep1 + %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index + %load = load i8, i8* %gep2 + %sext = sext i8 %load to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i32]] + %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i64]] + %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 + store i64 0, i64* %gep0 + store i64 1, i64* %gep1 + %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index + %load = load i64, i64* %gep2 + store i64 %load, i64 addrspace(1)* %out + ret void +} + +%struct.pair32 = type { i32, i32 } + +define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x %struct.pair32]] + %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x %struct.pair32] + %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %cmp = icmp eq i32 %in, 0 + %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 + %load = load i32, i32* %sel + store i32 %load, i32 addrspace(1)* %out + ret void +} + +; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it +; finds one, it should stop trying to promote. + +; FUNC-LABEL: {{^}}ptrtoint: +; SI-NOT: ds_write +; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 +define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %alloca = alloca [16 x i32] + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a + store i32 5, i32* %tmp0 + %tmp1 = ptrtoint [16 x i32]* %alloca to i32 + %tmp2 = add i32 %tmp1, 5 + %tmp3 = inttoptr i32 %tmp2 to i32* + %tmp4 = getelementptr i32, i32* %tmp3, i32 %b + %tmp5 = load i32, i32* %tmp4 + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/pv-packing.ll b/llvm/test/CodeGen/AMDGPU/pv-packing.ll new file mode 100644 index 00000000000..abeae563ff3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pv-packing.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +;CHECK: DOT4 T{{[0-9]\.X}} +;CHECK: MULADD_IEEE * T{{[0-9]\.W}} + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg2, i32 0 + %4 = extractelement <4 x float> %reg2, i32 1 + %5 = extractelement <4 x float> %reg2, i32 2 + %6 = extractelement <4 x float> %reg3, i32 0 + %7 = extractelement <4 x float> %reg3, i32 1 + %8 = extractelement <4 x float> %reg3, i32 2 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) + %12 = fmul float %0, %3 + %13 = fadd float %12, %6 + %14 = fmul float %1, %4 + %15 = fadd float %14, %7 + %16 = fmul float %2, %5 + %17 = fadd float %16, %8 + %18 = fmul float %11, %11 + %19 = fadd float %18, %0 + %20 = insertelement <4 x float> undef, float %13, i32 0 + %21 = insertelement <4 x float> %20, float %15, i32 1 + %22 = insertelement <4 x float> %21, float %17, i32 2 + %23 = insertelement <4 x float> %22, float %19, i32 3 + %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) + %25 = insertelement <4 x float> undef, float %24, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0
= { "ShaderType"="1" } +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/pv.ll b/llvm/test/CodeGen/AMDGPU/pv.ll new file mode 100644 index 00000000000..9a57dd19765 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pv.ll @@ -0,0 +1,241 @@ +; RUN: llc < %s -march=r600 | FileCheck %s + +; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) +; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = extractelement <4 x float> %reg4, i32 0 + %13 = extractelement <4 x float> %reg4, i32 1 + %14 = extractelement <4 x float> %reg4, i32 2 + %15 = extractelement <4 x float> %reg4, i32 3 + %16 = extractelement <4 x float> %reg5, i32 0 + %17 = extractelement <4 x float> %reg5, i32 1 + %18 = extractelement <4 x float> %reg5, i32 2 + %19 = extractelement <4 x float> %reg5, i32 3 + %20 = extractelement <4 x float> %reg6, i32 0 + %21 = extractelement <4 x float> %reg6, i32 1 + %22 = extractelement <4 x float> %reg6, i32 2 + %23 = extractelement <4 x float> %reg6, i32 3 + %24 = extractelement <4 x float> %reg7, i32 0 + %25 = extractelement <4 x float> %reg7, i32 1 + %26 = extractelement <4 x float> %reg7, i32 2 + %27 = extractelement <4 x float> %reg7, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* null + %29 = extractelement <4 x float> %28, i32 0 + %30 = fmul float %0, %29 + %31 = load <4 x float>, <4 x float> addrspace(8)* null + %32 = extractelement <4 x float> %31, i32 1 + %33 = fmul float %0, %32 + %34 = load <4 x float>, <4 x float> addrspace(8)* null + %35 = extractelement <4 x float> %34, i32 2 + %36 = fmul float %0, %35 + %37 = load <4 x float>, <4 x float> addrspace(8)* null + %38 = extractelement <4 x float> %37, i32 3 + %39 = fmul float %0, %38 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %1, %41 + %43 = fadd float %42, %30 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %1, %45 + %47 = fadd float %46, %33 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %1, %49 + %51 = fadd float %50, %36 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %1, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 2) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %2, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %2, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %2, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %2, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %73 = extractelement <4 x float> %72, i32 0 + %74 = fmul float %3, %73 + %75 = fadd float %74, %59 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 1 + %78 = fmul float %3, %77 + %79 = fadd float %78, %63 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = fmul float %3, %81 + %83 = fadd float %82, %67 + %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %85 = extractelement <4 x float> %84, i32 3 + %86 = fmul float %3, %85 + %87 = fadd float %86, %71 + %88 = insertelement <4 x float> undef, float %4, i32 0 + %89 = insertelement <4 x float> %88, float %5, i32 1 + %90 = insertelement <4 x float> %89, float %6, i32 2 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 + %92 = insertelement <4 x float> undef, float %4, i32 0 + %93 = insertelement <4 x float> %92, float %5, i32 1 + %94 = insertelement <4 x float> %93, float %6, i32 2 + %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 + %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) + %97 = call float @fabs(float %96) + %98 = call float @llvm.AMDGPU.rsq.f32(float %97) + %99 = fmul float %4, %98 + %100 = fmul float %5, %98 + %101 = fmul float %6, %98 + %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %103 = extractelement <4 x float> %102, i32 0 + %104 = fmul float %103, %8 + %105 = fadd float %104, %20 + %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %107 = extractelement <4 x float> %106, i32 1 + %108 = fmul float %107, %9 + %109 = fadd float %108, %21 + %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %111 = extractelement <4 x float> %110, i32 2 + %112 = fmul float %111, %10 + %113 = fadd float %112, %22 + %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) + %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) + %116 = call float @llvm.AMDIL.clamp.(float %113, 
float 0.000000e+00, float 1.000000e+00) + %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %119 = extractelement <4 x float> %118, i32 0 + %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %121 = extractelement <4 x float> %120, i32 1 + %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %123 = extractelement <4 x float> %122, i32 2 + %124 = insertelement <4 x float> undef, float %99, i32 0 + %125 = insertelement <4 x float> %124, float %100, i32 1 + %126 = insertelement <4 x float> %125, float %101, i32 2 + %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 + %128 = insertelement <4 x float> undef, float %119, i32 0 + %129 = insertelement <4 x float> %128, float %121, i32 1 + %130 = insertelement <4 x float> %129, float %123, i32 2 + %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 + %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) + %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %134 = extractelement <4 x float> %133, i32 0 + %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %136 = extractelement <4 x float> %135, i32 1 + %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %138 = extractelement <4 x float> %137, i32 2 + %139 = insertelement <4 x float> undef, float %99, i32 0 + %140 = insertelement <4 x float> %139, float %100, i32 1 + %141 = insertelement <4 x float> %140, float %101, i32 2 + %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 + %143 = insertelement <4 x float> undef, float %134, i32 0 + %144 = insertelement <4 x float> %143, float %136, i32 1 + %145 = insertelement <4 x float> %144, float %138, i32 2 + %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 + %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) + %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %149 = extractelement <4 x float> %148, i32 0 + %150 = fmul float %149, %8 + %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %152 = extractelement <4 x float> %151, i32 1 + %153 = fmul float %152, %9 + %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %155 = extractelement <4 x float> %154, i32 2 + %156 = fmul float %155, %10 + %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %158 = extractelement <4 x float> %157, i32 0 + %159 = fmul float %158, %12 + %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %161 = extractelement <4 x float> %160, i32 1 + %162 = fmul float %161, %13 + %163 = load <4 x float>, <4 
x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %164 = extractelement <4 x float> %163, i32 2 + %165 = fmul float %164, %14 + %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %167 = extractelement <4 x float> %166, i32 0 + %168 = fmul float %167, %16 + %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %170 = extractelement <4 x float> %169, i32 1 + %171 = fmul float %170, %17 + %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %173 = extractelement <4 x float> %172, i32 2 + %174 = fmul float %173, %18 + %175 = fcmp uge float %132, 0.000000e+00 + %176 = select i1 %175, float %132, float 0.000000e+00 + %177 = fcmp uge float %147, 0.000000e+00 + %178 = select i1 %177, float %147, float 0.000000e+00 + %179 = call float @llvm.pow.f32(float %178, float %24) + %180 = fcmp ult float %132, 0.000000e+00 + %181 = select i1 %180, float 0.000000e+00, float %179 + %182 = fadd float %150, %105 + %183 = fadd float %153, %109 + %184 = fadd float %156, %113 + %185 = fmul float %176, %159 + %186 = fadd float %185, %182 + %187 = fmul float %176, %162 + %188 = fadd float %187, %183 + %189 = fmul float %176, %165 + %190 = fadd float %189, %184 + %191 = fmul float %181, %168 + %192 = fadd float %191, %186 + %193 = fmul float %181, %171 + %194 = fadd float %193, %188 + %195 = fmul float %181, %174 + %196 = fadd float %195, %190 + %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) + %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) + %200 = insertelement <4 x float> undef, float %75, i32 0 + %201 = insertelement <4 x float> %200, float %79, i32 1 + %202 = insertelement <4 x float> %201, float %83, i32 2 + %203 = insertelement <4 x float> %202, float %87, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) + %204 = insertelement <4 x float> undef, float %197, i32 0 + %205 = insertelement <4 x float> %204, float %198, i32 1 + %206 = insertelement <4 x float> %205, float %199, i32 2 + %207 = insertelement <4 x float> %206, float %117, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/r600-encoding.ll b/llvm/test/CodeGen/AMDGPU/r600-encoding.ll new file mode 100644 index 00000000000..3a82ee30a32 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/r600-encoding.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -show-mc-encoding 
-mcpu=rs880 | FileCheck --check-prefix=R600 %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs. + +; EG: {{^}}test: +; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600: {{^}}test: +; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test(<4 x float> inreg %reg0) #0 { +entry: + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fmul float %r0, %r1 + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll b/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll new file mode 100644 index 00000000000..7cb80195b36 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/r600-export-fix.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s + +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0XYZ +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.YZ00 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 + + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fmul float %5, %0 + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %8 = extractelement <4 x float> %7, i32 1 + %9 = fmul float %8, %0 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %11 = extractelement <4 x float> %10, i32 2 + %12 = fmul float %11, %0 + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %14 = extractelement <4 x float> %13, i32 3 + %15 = fmul float %14, %0 + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %17 = extractelement <4 x float> %16, i32 0 + %18 = fmul float %17, %1 + %19 = fadd float %18, %6 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %1 + %23 = fadd float %22, %9 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %25 = extractelement <4 x float> %24, i32 2 + %26 = fmul float %25, %1 + %27 = fadd float %26, %12 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fmul float %29, %1 + %31 = fadd 
float %30, %15 + %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float %33, %2 + %35 = fadd float %34, %19 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %37 = extractelement <4 x float> %36, i32 1 + %38 = fmul float %37, %2 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %41 = extractelement <4 x float> %40, i32 2 + %42 = fmul float %41, %2 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %45 = extractelement <4 x float> %44, i32 3 + %46 = fmul float %45, %2 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %49 = extractelement <4 x float> %48, i32 0 + %50 = fmul float %49, %3 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %53 = extractelement <4 x float> %52, i32 1 + %54 = fmul float %53, %3 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %57 = extractelement <4 x float> %56, i32 2 + %58 = fmul float %57, %3 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %61 = extractelement <4 x float> %60, i32 3 + %62 = fmul float %61, %3 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* null + %65 = extractelement <4 x float> %64, i32 0 + %66 = load <4 x float>, <4 x float> addrspace(8)* null + %67 = extractelement <4 x float> %66, i32 1 + %68 = load <4 x float>, <4 x float> addrspace(8)* null + %69 = extractelement <4 x float> %68, i32 2 + %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %71 = extractelement <4 x float> %70, i32 0 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %73 = extractelement <4 x float> %72, i32 1 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %75 = extractelement <4 x float> %74, i32 2 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 0 + %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %79 = extractelement <4 x float> %78, i32 1 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = insertelement <4 x float> undef, float %51, i32 0 + %83 = insertelement <4 x float> %82, float %55, i32 1 + %84 = 
insertelement <4 x float> %83, float %59, i32 2 + %85 = insertelement <4 x float> %84, float %63, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) + %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 + %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) + %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 + %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 + %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) + %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %95 = insertelement <4 x float> %94, float %65, i32 1 + %96 = insertelement <4 x float> %95, float %67, i32 2 + %97 = insertelement <4 x float> %96, float %69, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) + %98 = insertelement <4 x float> undef, float %77, i32 0 + %99 = insertelement <4 x float> %98, float %79, i32 1 + %100 = insertelement <4 x float> %99, float %81, i32 2 + %101 = insertelement <4 x float> %100, float %71, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) + %102 = insertelement <4 x float> undef, float %73, i32 0 + %103 = insertelement <4 x float> %102, float %75, i32 1 + %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 + %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) + %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 + %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 + %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) + %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 + %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 + %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/llvm/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll new file mode 100644 index 00000000000..f388f8ffe29 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll @@ -0,0 +1,58 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg, <4 x float> inreg) #0 { +main_body: + %2 = extractelement <4 x float> %0, i32 0 + %3 = extractelement <4 x float> %0, i32 1 + %4 = extractelement <4 x float> %0, i32 2 + %5 = extractelement <4 x float> %0, i32 3 + %6 = insertelement <4 x float> undef, float %2, i32 0 + %7 = insertelement <4 x float> %6, float %3, i32 1 + %8 = insertelement <4 x float> %7, float %4, i32 2 + %9 = insertelement <4 x float> %8, float %5, i32 3 + %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) + %11 = extractelement <4 x float> %10, i32 0 + %12 = extractelement <4 x float> %10, i32 1 + %13 = 
extractelement <4 x float> %10, i32 2 + %14 = extractelement <4 x float> %10, i32 3 + %15 = call float @fabs(float %13) + %16 = fdiv float 1.000000e+00, %15 + %17 = fmul float %11, %16 + %18 = fadd float %17, 1.500000e+00 + %19 = fmul float %12, %16 + %20 = fadd float %19, 1.500000e+00 + %21 = insertelement <4 x float> undef, float %20, i32 0 + %22 = insertelement <4 x float> %21, float %18, i32 1 + %23 = insertelement <4 x float> %22, float %14, i32 2 + %24 = insertelement <4 x float> %23, float %5, i32 3 + %25 = extractelement <4 x float> %24, i32 0 + %26 = extractelement <4 x float> %24, i32 1 + %27 = extractelement <4 x float> %24, i32 2 + %28 = extractelement <4 x float> %24, i32 3 + %29 = insertelement <4 x float> undef, float %25, i32 0 + %30 = insertelement <4 x float> %29, float %26, i32 1 + %31 = insertelement <4 x float> %30, float %27, i32 2 + %32 = insertelement <4 x float> %31, float %28, i32 3 + %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) + %34 = extractelement <4 x float> %33, i32 0 + %35 = insertelement <4 x float> undef, float %34, i32 0 + %36 = insertelement <4 x float> %35, float %34, i32 1 + %37 = insertelement <4 x float> %36, float %34, i32 2 + %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/r600cfg.ll b/llvm/test/CodeGen/AMDGPU/r600cfg.ll new file mode 100644 index 00000000000..c7b9d65220f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/r600cfg.ll @@ -0,0 +1,119 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = bitcast float %0 to i32 + %5 = icmp eq i32 %4, 0 + %6 = sext i1 %5 to i32 + %7 = bitcast i32 %6 to float + %8 = bitcast float %7 to i32 + %9 = icmp ne i32 %8, 0 + %. 
= select i1 %9, float 0x36A0000000000000, float %0 + br label %LOOP + +LOOP: ; preds = %LOOP47, %main_body + %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] + %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] + %10 = bitcast float %temp4.1 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF41, label %ENDIF40 + +IF41: ; preds = %LOOP + %16 = insertelement <4 x float> undef, float %0, i32 0 + %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 + %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) + %20 = insertelement <4 x float> undef, float %0, i32 0 + %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 + %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 + %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) + %24 = insertelement <4 x float> undef, float %0, i32 0 + %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 + %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 + %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) + %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 + %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 + %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) + %32 = insertelement <4 x float> undef, float %0, i32 0 + %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 + %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 + %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) + ret void + +ENDIF40: ; preds = %LOOP + %36 = bitcast float %temp8.0 to i32 + %37 = add i32 %36, 1 + %38 = bitcast i32 %37 to float + %39 = bitcast float %temp4.1 to i32 + %40 = urem i32 %39, 2 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp eq i32 %42, 0 + %44 = sext i1 %43 to i32 + %45 = bitcast i32 %44 to float + %46 = bitcast float %45 to i32 + %47 = icmp ne i32 %46, 0 + %48 = bitcast float %temp4.1 to i32 + br i1 %47, label %IF44, label %ELSE45 + +IF44: ; preds = %ENDIF40 + %49 = udiv i32 %48, 2 + br label %ENDIF43 + +ELSE45: ; preds = %ENDIF40 + %50 = mul i32 3, %48 + %51 = add i32 %50, 1 + br label %ENDIF43 + +ENDIF43: ; preds = %ELSE45, %IF44 + %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] + %52 = bitcast i32 %.sink to float + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = bitcast float %54 to i32 + br label %LOOP47 + +LOOP47: ; preds = %ENDIF48, %ENDIF43 + %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] + %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] + %56 = bitcast float %temp28.0 to i32 + %57 = icmp uge i32 %56, %55 + %58 = sext i1 %57 to i32 + %59 = bitcast i32 %58 to float + %60 = bitcast float %59 to i32 + %61 = icmp ne i32 %60, 0 + br i1 %61, label %LOOP, label %ENDIF48 + +ENDIF48: ; preds 
= %LOOP47 + %62 = bitcast float %temp12.1 to i32 + %63 = mul i32 %62, 2 + %64 = bitcast i32 %63 to float + %65 = bitcast float %64 to i32 + %66 = urem i32 %65, 2147483647 + %67 = bitcast i32 %66 to float + %68 = bitcast float %temp28.0 to i32 + %69 = add i32 %68, 1 + %70 = bitcast i32 %69 to float + br label %LOOP47 +} + +declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/reciprocal.ll b/llvm/test/CodeGen/AMDGPU/reciprocal.ll new file mode 100644 index 00000000000..b4ac47afced --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reciprocal.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = fdiv float 1.0, %r0 + %vec = insertelement <4 x float> undef, float %r1, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll new file mode 100644 index 00000000000..de6bfb31088 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone + +; SI-LABEL: {{^}}foo: +; SI: .section .AMDGPU.csdata +; SI: ; Kernel info: +; SI: ; NumSgprs: {{[0-9]+}} +; SI: ; NumVgprs: {{[0-9]+}} +define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { + %tid = call i32 @llvm.SI.tid() nounwind readnone + %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid + %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid + %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %outptr, align 4 + ret void +} + +; SI-LABEL: {{^}}one_vgpr_used: +; SI: NumVgprs: 1 +define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { + store i32 %x, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll new file mode 100644 index 00000000000..187650ff9a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 + store <2 x double> %tmp1, 
<2 x double> addrspace(1)* %y, align 16 + ret void +} + +; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 + ret void +} + +; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 + store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 + store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 + ret void +} + +; SI-LABEL: {{^}}no_reorder_extload_64: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI-NOT: ds_read +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 + %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 + %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> + %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> + %tmp7 = add <2 x i64> %tmp1ext, + %tmp9 = add <2 x i64> %tmp4ext, + %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> + %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> + store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 + store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll new file mode 100644 index 00000000000..3f4ceb7e031 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotl_i64: +; BOTH-DAG: s_lshl_b64 +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH: s_or_b64 +; BOTH: s_endpgm +define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %0 = shl i64 %x, %y + %1 = sub i64 64, %y + %2 = lshr i64 %x, %1 + %3 = or i64 %0, %2 + store i64 %3, i64 
addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotl_i64: +; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshlrev_b64 +; BOTH-DAG: v_sub_i32 +; SI: v_lshr_b64 +; VI: v_lshrrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +; BOTH: s_endpgm +define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = shl i64 %x, %y + %tmp1 = sub i64 64, %y + %tmp2 = lshr i64 %x, %tmp1 + %tmp3 = or i64 %tmp0, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll new file mode 100644 index 00000000000..6c144cd56ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotl_i32: +; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x +; R600-NEXT: 32 +; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} + +; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} +; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] +; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] +define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %0 = shl i32 %x, %y + %1 = sub i32 32, %y + %2 = lshr i32 %x, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v2i32: +; SI-DAG: s_sub_i32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %0 = shl <2 x i32> %x, %y + %1 = sub <2 x i32> <i32 32, i32 32>, %y + %2 = lshr <2 x i32> %x, %1 + %3 = or <2 x i32> %0, %2 + store <2 x i32> %3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v4i32: +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %0 = shl <4 x i32> %x, %y + %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y + %2 = lshr <4 x i32> %x, %1 + %3 = or <4 x i32> %0, %2 + store <4 x i32> %3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll new file mode 100644 index 00000000000..586de44a566 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotr_i64: +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH-DAG: s_lshl_b64 +; BOTH: s_or_b64 +define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_i64: +; BOTH-DAG: v_sub_i32 +; SI-DAG: v_lshr_b64 +; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshrrev_b64 +; VI-DAG: 
v_lshlrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}s_rotr_v2i64: +define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +entry: + %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_v2i64: +define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +entry: + %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 + %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 + %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll new file mode 100644 index 00000000000..044f9ffe6d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotr_i32: +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %tmp0 = sub i32 32, %y + %tmp1 = shl i32 %x, %tmp0 + %tmp2 = lshr i32 %x, %y + %tmp3 = or i32 %tmp1, %tmp2 + store i32 %tmp3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v2i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y + %tmp1 = shl <2 x i32> %x, %tmp0 + %tmp2 = lshr <2 x i32> %x, %y + %tmp3 = or <2 x i32> %tmp1, %tmp2 + store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v4i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y + %tmp1 = shl <4 x i32> %x, %tmp0 + %tmp2 = lshr <4 x i32> %x, %y + %tmp3 = or <4 x i32> %tmp1, %tmp2 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll new file mode 100644 index 00000000000..b67b800c737 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rsq.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind 
readnone +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; SI-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 +; SI: s_endpgm +define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64_e32 +; SI: s_endpgm +define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { + %val = load double, double addrspace(1)* %in, align 4 + %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone + %div = fdiv double 1.0, %sqrt + store double %div, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f32_sgpr: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; SI: s_endpgm +define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; Recognize that this is rsqrt(a) * rcp(b) * c, +; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. + +; SI-LABEL: @rsqrt_fmul +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 + +; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] +; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] +; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-UNSAFE: buffer_store_dword [[RESULT]] + +; SI-SAFE-NOT: v_rsq_f32 + +; SI: s_endpgm +define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %x = call float @llvm.sqrt.f32(float %a) + %y = fmul float %x, %b + %z = fdiv float %c, %y + store float %z, float addrspace(1)* %out.gep + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rv7x0_count3.ll b/llvm/test/CodeGen/AMDGPU/rv7x0_count3.ll new file mode 100644 index 00000000000..c3fd923e459 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/rv7x0_count3.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s + +; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] + +define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %1 = extractelement <4 x float> %reg1, i32 0 + %2 = extractelement <4 x float> %reg1, i32 1 + %3 = extractelement <4 x float> %reg1, i32 2 + %4 = extractelement <4 x float> %reg1, i32 3 + %5 = insertelement <4 x float> undef, float %1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float 
%3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) + %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) + %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) + %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) + %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) + %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) + %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) + %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) + %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) + %19 = fadd <4 x float> %9, %10 + %20 = fadd <4 x float> %19, %11 + %21 = fadd <4 x float> %20, %12 + %22 = fadd <4 x float> %21, %13 + %23 = fadd <4 x float> %22, %14 + %24 = fadd <4 x float> %23, %15 + %25 = fadd <4 x float> %24, %16 + %26 = fadd <4 x float> %25, %17 + %27 = fadd <4 x float> %26, %18 + call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll new file mode 100644 index 00000000000..6b1a36c979c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -0,0 +1,185 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_movk_i32_k0: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k1: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k2: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 
addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k3: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k4: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k5: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k6: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k7: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}s_movk_i32_k8: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 
v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k9: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k10: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k11: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k12: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 + store i64 %or, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll new file mode 100644 index 00000000000..f8ced7942a6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + 
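For reference when reading the saddo tests below: @llvm.sadd.with.overflow returns the wrapped sum together with an i1 flag that is set when signed overflow occurred. One standard scalar expansion of that flag (a sketch for orientation only, not necessarily the sequence this backend emits) is:

  %sum = add i32 %a, %b
  %xa = xor i32 %sum, %a ; sign of %sum differs from sign of %a?
  %xb = xor i32 %sum, %b ; sign of %sum differs from sign of %b?
  %t = and i32 %xa, %xb ; differs from both inputs...
  %ovf = icmp slt i32 %t, 0 ; ...exactly when signed overflow occurred
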
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}saddo_i64_zext: +define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i32: +define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i32: +define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i64: +define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll new file mode 100644 index 00000000000..0b964957654 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -0,0 +1,118 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; In this test both the pointer and the offset operands to the +; BUFFER_LOAD instructions end up being stored in vgprs. This +; requires us to add the pointer and offset together, store the +; result in the offset operand (vaddr), and then store 0 in an +; sgpr register pair and use that for the pointer operand +; (low 64-bits of srsrc). 
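A rough sketch of the instruction pattern that comment describes (illustrative only: the register numbers here are invented, and the authoritative patterns are the CHECK lines that follow):

  v_add_i32_e32 v0, vcc, v2, v0 ; vaddr = pointer + offset, computed in VGPRs
  s_mov_b64 s[0:1], 0 ; zero for the low 64 bits of srsrc
  buffer_load_ubyte v1, v[0:1], s[0:3], 0 addr64
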
+ +; CHECK-LABEL: {{^}}mubuf: + +; Make sure we aren't using VGPRs for the source operand of s_mov_b64 +; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v + +; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_* +; instructions +; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 +define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() #1 + %1 = call i32 @llvm.r600.read.tidig.y() #1 + %2 = sext i32 %0 to i64 + %3 = sext i32 %1 to i64 + br label %loop + +loop: + %4 = phi i64 [0, %entry], [%5, %loop] + %5 = add i64 %2, %4 + %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 + %7 = load i8, i8 addrspace(1)* %6, align 1 + %8 = or i64 %5, 1 + %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 + %10 = load i8, i8 addrspace(1)* %9, align 1 + %11 = add i8 %7, %10 + %12 = sext i8 %11 to i32 + store i32 %12, i32 addrspace(1)* %out + %13 = icmp slt i64 %5, 10 + br i1 %13, label %loop, label %done + +done: + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 +declare i32 @llvm.r600.read.tidig.y() #1 + +attributes #1 = { nounwind readnone } + +; Test moving an SMRD instruction to the VALU + +; CHECK-LABEL: {{^}}smrd_valu: +; CHECK: buffer_load_dword [[OUT:v[0-9]+]] +; CHECK: buffer_store_dword [[OUT]] + +define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { +entry: + %0 = icmp ne i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in + br label %endif + +else: + %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in + %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 + br label %endif + +endif: + %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] + %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 + %6 = load i32, i32 addrspace(2)* %5 + store i32 %6, i32 addrspace(1)* %out + ret void +} + +; Test moving an SMRD with an immediate offset to the VALU + +; CHECK-LABEL: {{^}}smrd_valu2: +; CHECK: buffer_load_dword +define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %1 = add i32 %0, 4 + %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 + %3 = load i32, i32 addrspace(2)* %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}s_load_imm_v8i32: +; CHECK: buffer_load_dwordx4 +; CHECK: buffer_load_dwordx4 +define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* + %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 + store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; CHECK-LABEL: {{^}}s_load_imm_v16i32: +; CHECK: buffer_load_dwordx4 +; CHECK: buffer_load_dwordx4 +; CHECK: buffer_load_dwordx4 +; CHECK: buffer_load_dwordx4 +define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { +entry: + %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 + %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 + %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* + %tmp3 = 
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll new file mode 100644 index 00000000000..0970e5d3063 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tmp1 = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tmp1 = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed +; to produce one, but for some reason never made it to selection. 
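+; The usual seed for one is an insertelement of a scalar into element 0 of an
+; undef vector, which DAG building will normally fold to a SCALAR_TO_VECTOR
+; node, e.g.:
+;   %v = insertelement <2 x i64> undef, i64 %x, i32 0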
+ + +; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 +; %bc = bitcast i32 %tmp1 to <4 x i8> + +; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> +; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 +; ret void +; } + +; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 +; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 +; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> +; %add = add <4 x i32> %bc, +; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> +; %add = add <8 x i16> %bc, +; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll new file mode 100644 index 00000000000..11e8f5176f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll @@ -0,0 +1,82 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %1, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = fcmp ult float %0, 5.700000e+01 + %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 + %11 = fsub float -0.000000e+00, %10 + %12 = fptosi float %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %8 to i32 + %15 = bitcast float %13 to i32 + %16 = and i32 %14, %15 + %17 = bitcast i32 %16 to float + %18 = bitcast float %17 to i32 + %19 = icmp ne i32 %18, 0 + %20 = fcmp ult float %0, 0.000000e+00 + %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 + %22 = fsub float -0.000000e+00, %21 + %23 = fptosi float %22 to i32 + %24 = bitcast i32 %23 to float + %25 = bitcast float %24 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %19, label %IF, label %ELSE + +IF: ; preds = %main_body + %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 + %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 + br label %ENDIF + +ELSE: ; preds = %main_body + br i1 %26, label %ENDIF, label %ELSE17 + +ENDIF: ; preds = %ELSE17, %ELSE, %IF + %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] + %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) + %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) + %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) + %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %31 = insertelement <4 x float> undef, float %27, i32 0 + %32 = insertelement <4 x float> %31, float %28, i32 1 + %33 = insertelement <4 x float> %32, float %29, i32 2 + %34 = insertelement <4 x float> %33, float %30, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) + ret void + +ELSE17: ; preds = %ELSE + %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %38 = fadd float %35, 0x3FC99999A0000000 + %39 = fadd float %36, 0x3FC99999A0000000 + %40 = fadd float %37, 0x3FC99999A0000000 + %41 = fadd float %38, 0x3FC99999A0000000 + %42 = fadd float %39, 0x3FC99999A0000000 + %43 = fadd float %40, 0x3FC99999A0000000 + %44 = fadd float %41, 0x3FC99999A0000000 + %45 = fadd float %42, 0x3FC99999A0000000 + %46 = fadd float %43, 0x3FC99999A0000000 + %47 = fadd float %44, 0x3FC99999A0000000 + %48 = fadd float %45, 0x3FC99999A0000000 + %49 = fadd float %46, 0x3FC99999A0000000 + br label %ENDIF +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } +attributes #1 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll new file mode 100644 index 00000000000..759197ca61f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -0,0 +1,88 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = bitcast float %3 to i32 + %5 = sdiv i32 %4, 4 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = mul i32 %7, 4 + %9 = bitcast i32 %8 to float + %10 = bitcast float %9 to i32 + %11 = sub i32 0, %10 + %12 = bitcast i32 %11 to float + %13 = bitcast float %3 to i32 + %14 = bitcast float %12 to i32 + %15 = add i32 %13, %14 + %16 = bitcast i32 %15 to float + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = load <4 x float>, <4 x float> addrspace(9)* null + %20 = extractelement <4 x float> %19, i32 1 + %21 = load <4 x float>, <4 x float> addrspace(9)* null + %22 = extractelement <4 x float> %21, i32 2 + br label %LOOP + +LOOP: ; preds = %IF31, %main_body + %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] + %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] + %temp5.0 = phi 
float [ %20, %main_body ], [ %temp5.1, %IF31 ] + %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] + %23 = bitcast float %temp12.0 to i32 + %24 = bitcast float %6 to i32 + %25 = icmp sge i32 %23, %24 + %26 = sext i1 %25 to i32 + %27 = bitcast i32 %26 to float + %28 = bitcast float %27 to i32 + %29 = icmp ne i32 %28, 0 + br i1 %29, label %IF, label %LOOP29 + +IF: ; preds = %LOOP + %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) + %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) + %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %34 = insertelement <4 x float> undef, float %30, i32 0 + %35 = insertelement <4 x float> %34, float %31, i32 1 + %36 = insertelement <4 x float> %35, float %32, i32 2 + %37 = insertelement <4 x float> %36, float %33, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) + ret void + +LOOP29: ; preds = %LOOP, %ENDIF30 + %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] + %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] + %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] + %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] + %38 = bitcast float %temp20.0 to i32 + %39 = bitcast float %16 to i32 + %40 = icmp sge i32 %38, %39 + %41 = sext i1 %40 to i32 + %42 = bitcast i32 %41 to float + %43 = bitcast float %42 to i32 + %44 = icmp ne i32 %43, 0 + br i1 %44, label %IF31, label %ENDIF30 + +IF31: ; preds = %LOOP29 + %45 = bitcast float %temp12.0 to i32 + %46 = add i32 %45, 1 + %47 = bitcast i32 %46 to float + br label %LOOP + +ENDIF30: ; preds = %LOOP29 + %48 = bitcast float %temp20.0 to i32 + %49 = add i32 %48, 1 + %50 = bitcast i32 %49 to float + br label %LOOP29 +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll new file mode 100644 index 00000000000..28cc08abc02 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop.ll @@ -0,0 +1,55 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = load <4 x float>, <4 x float> addrspace(9)* null + %5 = extractelement <4 x float> %4, i32 0 + %6 = load <4 x float>, <4 x float> addrspace(9)* null + %7 = extractelement <4 x float> %6, i32 1 + %8 = load <4 x float>, <4 x float> addrspace(9)* null + %9 = extractelement <4 x float> %8, i32 2 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] + %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] + %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] + %10 = bitcast float %temp8.0 to i32 + %11 = bitcast float %3 to i32 + %12 = icmp sge i32 %10, %11 + %13 = sext i1 %12 to i32 + %14 = bitcast i32 %13 to float + %15 = bitcast float %14 to i32 + %16 = icmp ne i32 %15, 0 + br i1 %16, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %17 = call float 
@llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+  %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+  %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+  %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  %21 = insertelement <4 x float> undef, float %17, i32 0
+  %22 = insertelement <4 x float> %21, float %18, i32 1
+  %23 = insertelement <4 x float> %22, float %19, i32 2
+  %24 = insertelement <4 x float> %23, float %20, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+  ret void
+
+ENDIF: ; preds = %LOOP
+  %25 = bitcast float %temp8.0 to i32
+  %26 = add i32 %25, 1
+  %27 = bitcast i32 %26 to float
+  br label %LOOP
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
new file mode 100644
index 00000000000..3f728fd873b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+  %load0 = load i32, i32 addrspace(1)* %ptr, align 4
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+  %load1 = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %load0, i32 addrspace(1)* %out0, align 4
+  store i32 %load1, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
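+; Both addresses here are uniform (an SGPR base plus an SGPR offset), so the
+; resulting buffer loads are expected to address purely through the resource
+; descriptor and offset, with no vaddr operand for the pairing code to look at.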
+; FUNC-LABEL: {{^}}same_base_ptr_crash: +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +entry: + %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + %tmp0 = load i32, i32 addrspace(1)* %out + %tmp1 = load i32, i32 addrspace(1)* %out1 + %tmp2 = add i32 %tmp0, %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll new file mode 100644 index 00000000000..54946509683 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll @@ -0,0 +1,94 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %1 = extractelement <4 x float> %0, i32 0 + %2 = fadd float 1.000000e+03, %1 + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %4 = extractelement <4 x float> %3, i32 0 + %5 = bitcast float %4 to i32 + %6 = icmp eq i32 %5, 0 + %7 = sext i1 %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %IF, label %ELSE + +IF: ; preds = %main_body + %11 = call float @fabs(float %2) + %12 = fcmp ueq float %11, 0x7FF0000000000000 + %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 + %14 = fsub float -0.000000e+00, %13 + %15 = fptosi float %14 to i32 + %16 = bitcast i32 %15 to float + %17 = bitcast float %16 to i32 + %18 = icmp ne i32 %17, 0 + %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 + %19 = fcmp une float %2, %2 + %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 + %21 = fsub float -0.000000e+00, %20 + %22 = fptosi float %21 to i32 + %23 = bitcast i32 %22 to float + %24 = bitcast float %23 to i32 + %25 = icmp ne i32 %24, 0 + %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 + %26 = bitcast float %. 
to i32 + %27 = sitofp i32 %26 to float + %28 = bitcast float %temp8.0 to i32 + %29 = sitofp i32 %28 to float + %30 = fcmp ugt float %2, 0.000000e+00 + %31 = select i1 %30, float 1.000000e+00, float %2 + %32 = fcmp uge float %31, 0.000000e+00 + %33 = select i1 %32, float %31, float -1.000000e+00 + %34 = fadd float %33, 1.000000e+00 + %35 = fmul float %34, 5.000000e-01 + br label %ENDIF + +ELSE: ; preds = %main_body + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 0 + %38 = bitcast float %37 to i32 + %39 = icmp eq i32 %38, 1 + %40 = sext i1 %39 to i32 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp ne i32 %42, 0 + br i1 %43, label %IF23, label %ENDIF + +ENDIF: ; preds = %IF23, %ELSE, %IF + %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 + %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 + %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 + %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) + ret void + +IF23: ; preds = %ELSE + %48 = fcmp ult float 0.000000e+00, %2 + %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 + %50 = fsub float -0.000000e+00, %49 + %51 = fptosi float %50 to i32 + %52 = bitcast i32 %51 to float + %53 = bitcast float %52 to i32 + %54 = icmp ne i32 %53, 0 + %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 + %55 = bitcast float %.28 to i32 + %56 = sitofp i32 %55 to float + %57 = load <4 x float>, <4 x float> addrspace(8)* null + %58 = extractelement <4 x float> %57, i32 0 + %59 = fsub float -0.000000e+00, %58 + %60 = fadd float %2, %59 + br label %ENDIF +} + +declare float @fabs(float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readonly } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-if.ll new file mode 100644 index 00000000000..94c653c8f25 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-if.ll @@ -0,0 +1,46 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 1.000000e+03, %main_body 
], [ 1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 1.000000e+03, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll new file mode 100644 index 00000000000..6b3e0814c38 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s + +; FUNC-LABEL: {{^}}cluster_arg_loads: +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe +; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { + store i32 %x, i32 addrspace(1)* %out0, align 4 + store i32 %y, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when +; s_load_dwordx2 has a register offset + +; FUNC-LABEL: @same_base_ptr_crash +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_endpgm +define void @same_base_ptr_crash(i64 addrspace(1)* %out, + i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, + i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, + i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, + i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, + i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, + i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, + i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, + i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, + i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, + i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, + 
i64 %arg80, i64 %arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, + i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, + i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, + i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, + i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, + i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { +entry: + %value = add i64 %arg125, %arg126 + store i64 %value, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll new file mode 100644 index 00000000000..3863afda5dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -0,0 +1,163 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI + +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + + +; SI-LABEL: {{^}}main( +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 2 + %2 = fcmp ult float %0, 0.000000e+00 + %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 + %4 = fsub float -0.000000e+00, %3 + %5 = fptosi float %4 to i32 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = icmp ne i32 %7, 0 + br i1 %8, label %LOOP, label %ENDIF + +Flow1: ; preds = %ENDIF19, %ENDIF16 + %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] + %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] + %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] + %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] + %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] + %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] + br label %Flow + +Flow2: ; preds = %Flow + br label %ENDIF + +ENDIF: ; preds = %main_body, %Flow2 + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] + %15 = extractelement <4 x float> %reg1, i32 1 + %16 = extractelement <4 x float> %reg1, i32 3 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* null + %24 = extractelement <4 x float> %23, i32 2 + %25 = fmul float %24, %0 + %26 = load <4 x float>, <4 x float> addrspace(9)* null + %27 = extractelement <4 x float> %26, i32 3 + %28 = fmul float %27, %0 + %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float %30, %15 + %32 = fadd float %31, %19 + %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, 
i64 0, i32 1) + %34 = extractelement <4 x float> %33, i32 1 + %35 = fmul float %34, %15 + %36 = fadd float %35, %22 + %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %38 = extractelement <4 x float> %37, i32 2 + %39 = fmul float %38, %15 + %40 = fadd float %39, %25 + %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %42 = extractelement <4 x float> %41, i32 3 + %43 = fmul float %42, %15 + %44 = fadd float %43, %28 + %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %46 = extractelement <4 x float> %45, i32 0 + %47 = fmul float %46, %1 + %48 = fadd float %47, %32 + %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %50 = extractelement <4 x float> %49, i32 1 + %51 = fmul float %50, %1 + %52 = fadd float %51, %36 + %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %54 = extractelement <4 x float> %53, i32 2 + %55 = fmul float %54, %1 + %56 = fadd float %55, %40 + %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %58 = extractelement <4 x float> %57, i32 3 + %59 = fmul float %58, %1 + %60 = fadd float %59, %44 + %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %62 = extractelement <4 x float> %61, i32 0 + %63 = fmul float %62, %16 + %64 = fadd float %63, %48 + %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %66 = extractelement <4 x float> %65, i32 1 + %67 = fmul float %66, %16 + %68 = fadd float %67, %52 + %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %70 = extractelement <4 x float> %69, i32 2 + %71 = fmul float %70, %16 + %72 = fadd float %71, %56 + %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %74 = extractelement <4 x float> %73, i32 3 + %75 = fmul float %74, %16 + %76 = fadd float %75, %60 + %77 = insertelement <4 x float> undef, float %64, i32 0 + %78 = insertelement <4 x float> %77, float %68, i32 1 + %79 = insertelement <4 x float> %78, float %72, i32 2 + %80 = insertelement <4 x float> %79, float %76, i32 3 + call void @llvm.AMDGPU.barrier.local() + %81 = insertelement <4 x float> undef, float %temp.0, i32 0 + %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 + %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 + %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 + call void @llvm.AMDGPU.barrier.local() + ret void + +LOOP: ; preds = %main_body, %Flow + %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] + %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] + %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] + %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] + %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] + %85 = fcmp uge float %temp4.0, %0 
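+  ; The select/fsub/fptosi sequence below materializes the compare as a 0/-1
+  ; integer mask (1.0 or 0.0, negated, then converted to an integer), the
+  ; boolean encoding apparently used by the shader frontend these tests came
+  ; from.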
+ %86 = select i1 %85, float 1.000000e+00, float 0.000000e+00 + %87 = fsub float -0.000000e+00, %86 + %88 = fptosi float %87 to i32 + %89 = bitcast i32 %88 to float + %90 = bitcast float %89 to i32 + %91 = icmp ne i32 %90, 0 + %92 = xor i1 %91, true + br i1 %92, label %ENDIF16, label %Flow + +ENDIF16: ; preds = %LOOP + %93 = fcmp une float %1, %temp4.0 + %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 + %95 = fsub float -0.000000e+00, %94 + %96 = fptosi float %95 to i32 + %97 = bitcast i32 %96 to float + %98 = bitcast float %97 to i32 + %99 = icmp ne i32 %98, 0 + %100 = xor i1 %99, true + br i1 %100, label %ENDIF19, label %Flow1 + +Flow: ; preds = %Flow1, %LOOP + %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] + %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] + %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] + %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] + %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] + %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] + %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] + %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] + %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] + %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] + br i1 %110, label %Flow2, label %LOOP + +ENDIF19: ; preds = %ENDIF16 + %111 = fadd float %temp.1, 1.000000e+00 + %112 = fadd float %temp1.1, 0.000000e+00 + %113 = fadd float %temp2.1, 0.000000e+00 + %114 = fadd float %temp3.1, 0.000000e+00 + %115 = fadd float %temp4.0, 1.000000e+00 + br label %Flow1 +} + +attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll new file mode 100644 index 00000000000..8d980dbf899 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %0, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %LOOP, label %ENDIF + +ENDIF: ; preds = %ENDIF16, %LOOP, %main_body + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] + %11 = load <4 x float>, <4 x float> addrspace(9)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %12, %0 + %14 = load <4 x float>, <4 x float> addrspace(9)* null + %15 = extractelement <4 x float> %14, i32 1 + %16 = fmul float %15, %0 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 2 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 3 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], 
[1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float %24, %1 + %26 = fadd float %25, %13 + %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %28 = extractelement <4 x float> %27, i32 1 + %29 = fmul float %28, %1 + %30 = fadd float %29, %16 + %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %32 = extractelement <4 x float> %31, i32 2 + %33 = fmul float %32, %1 + %34 = fadd float %33, %19 + %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %36 = extractelement <4 x float> %35, i32 3 + %37 = fmul float %36, %1 + %38 = fadd float %37, %22 + %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %40 = extractelement <4 x float> %39, i32 0 + %41 = fmul float %40, %2 + %42 = fadd float %41, %26 + %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %44 = extractelement <4 x float> %43, i32 1 + %45 = fmul float %44, %2 + %46 = fadd float %45, %30 + %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %48 = extractelement <4 x float> %47, i32 2 + %49 = fmul float %48, %2 + %50 = fadd float %49, %34 + %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %52 = extractelement <4 x float> %51, i32 3 + %53 = fmul float %52, %2 + %54 = fadd float %53, %38 + %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %56 = extractelement <4 x float> %55, i32 0 + %57 = fmul float %56, %3 + %58 = fadd float %57, %42 + %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %60 = extractelement <4 x float> %59, i32 1 + %61 = fmul float %60, %3 + %62 = fadd float %61, %46 + %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %64 = extractelement <4 x float> %63, i32 2 + %65 = fmul float %64, %3 + %66 = fadd float %65, %50 + %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %68 = extractelement <4 x float> %67, i32 3 + %69 = fmul float %68, %3 + %70 = fadd float %69, %54 + %71 = insertelement <4 x float> undef, float %58, i32 0 + %72 = insertelement <4 x float> %71, float %62, i32 1 + %73 = insertelement <4 x float> %72, float %66, i32 2 + %74 = insertelement <4 x float> %73, float %70, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) + %75 = insertelement <4 x float> undef, float %temp.0, i32 0 + %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 + %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 + %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) + ret void + +LOOP: ; preds = %main_body, %ENDIF19 + %temp.1 = phi float [ %93, %ENDIF19 ], [ 
0.000000e+00, %main_body ]
+  %temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ]
+  %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+  %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+  %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ]
+  %79 = fcmp uge float %temp4.0, %0
+  %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00
+  %81 = fsub float -0.000000e+00, %80
+  %82 = fptosi float %81 to i32
+  %83 = bitcast i32 %82 to float
+  %84 = bitcast float %83 to i32
+  %85 = icmp ne i32 %84, 0
+  br i1 %85, label %ENDIF, label %ENDIF16
+
+ENDIF16: ; preds = %LOOP
+  %86 = fcmp une float %2, %temp4.0
+  %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00
+  %88 = fsub float -0.000000e+00, %87
+  %89 = fptosi float %88 to i32
+  %90 = bitcast i32 %89 to float
+  %91 = bitcast float %90 to i32
+  %92 = icmp ne i32 %91, 0
+  br i1 %92, label %ENDIF, label %ENDIF19
+
+ENDIF19: ; preds = %ENDIF16
+  %93 = fadd float %temp.1, 1.000000e+00
+  %94 = fadd float %temp1.1, 0.000000e+00
+  %95 = fadd float %temp2.1, 0.000000e+00
+  %96 = fadd float %temp3.1, 0.000000e+00
+  %97 = fadd float %temp4.0, 1.000000e+00
+  br label %LOOP
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
new file mode 100644
index 00000000000..56088718ada
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; When a frame index offset is more than 12 bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer address
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same register for each scratch buffer access.
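+;
+; Each alloca below is 8192 * 4 = 32768 bytes, so the second buffer begins at
+; offset 0x8000, past the 4095-byte reach of the 12-bit immediate offset field,
+; which is why the checks expect the offset to be materialized in a VGPR.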
+
+; CHECK-LABEL: {{^}}legal_offset_fi:
+; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}}
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+entry:
+  %scratch0 = alloca [8192 x i32]
+  %scratch1 = alloca [8192 x i32]
+
+  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0
+  store i32 1, i32* %scratchptr0
+
+  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0
+  store i32 2, i32* %scratchptr1
+
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, i32* %if_ptr
+  br label %done
+
+else:
+  %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, i32* %else_ptr
+  br label %done
+
+done:
+  %value = phi i32 [%if_value, %if], [%else_value, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}legal_offset_fi_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen
+; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
+
+define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+entry:
+  %scratch0 = alloca [8192 x i32]
+  %scratch1 = alloca [8192 x i32]
+
+  %offset0 = load i32, i32 addrspace(1)* %offsets
+  %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0
+  store i32 %offset0, i32* %scratchptr0
+
+  %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1
+  %offset1 = load i32, i32 addrspace(1)* %offsetptr1
+  %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1
+  store i32 %offset1, i32* %scratchptr1
+
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %if, label %else
+
+if:
+  %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset
+  %if_value = load i32, i32* %if_ptr
+  br label %done
+
+else:
+  %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset
+  %else_value = load i32, i32* %else_ptr
+  br label %done
+
+done:
+  %value = phi i32 [%if_value, %if], [%else_value, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
new file mode 100644
index 00000000000..de645353a40
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; The code generated by sdiv is long and complex and may frequently change.
+; The goal of this test is to make sure the ISel doesn't fail.
+;
+; This program was previously failing to compile when one of the selectcc
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
+; selectcc Remainder -1, 0, -1, SETGT
+; This was fixed by adding an additional pattern in R600Instructions.td to
+; match this pattern with a CNDGE_INT.

+; FUNC-LABEL: {{^}}sdiv_i32:
+; EG: CF_END
+define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %num = load i32, i32 addrspace(1) * %in
+  %den = load i32, i32 addrspace(1) * %den_ptr
+  %result = sdiv i32 %num, %den
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i32_4:
+define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1) * %in
+  %result = sdiv i32 %num, 4
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; Divide by a weird constant to make sure setIntDivIsCheap is
+; working.
+
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %num = load i32, i32 addrspace(1) * %in
+  %result = sdiv i32 %num, 3435
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+  %result = sdiv <2 x i32> %num, %den
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %result = sdiv <2 x i32> %num, <i32 4, i32 4>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+  %result = sdiv <4 x i32> %num, %den
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; Tests for 64-bit divide bypass.
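+; (The bypass optimization guards a full 64-bit divide with a runtime check
+; and uses a 32-bit divide when both operands happen to fit in 32 bits.)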
+; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = sdiv i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = srem i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %resultdiv = sdiv i64 %a, %b +; %resultrem = srem i64 %a, %b +; %result = add i64 %resultdiv, %resultrem +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll new file mode 100644 index 00000000000..ad5df39f550 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll @@ -0,0 +1,239 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sdiv24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = sdiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = sdiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sdiv25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: 
v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = srem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = srem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_1: +; SI-NOT: 
v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem64.ll b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll new file mode 100644 index 00000000000..a9b2b7f9df5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll @@ -0,0 +1,225 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_sdiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = sdiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT 
+;EG: AND_INT {{.*}}, 1, + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = srem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll new file mode 100644 index 00000000000..6735394e93a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI + +; FUNC-LABEL: {{^}}select_i1: +; SI: v_cndmask_b32 +; SI-NOT: v_cndmask_b32 +define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i1 %a, i1 %b + store i1 %sel, i1 addrspace(1)* %out, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll new file mode 100644 index 00000000000..59082c65cc8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -0,0 +1,156 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test expansion of scalar selects on vectors. +; Evergreen not enabled since it seems to be having problems with doubles. + + +; FUNC-LABEL: {{^}}select_v4i8: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { + %cmp = icmp eq i8 %c, 0 + %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i16: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v2i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx2 +define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b + store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx4 +define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b + store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v2f32: +; SI: buffer_store_dwordx2 +define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f32: +; SI: buffer_store_dwordx4 +define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x float> %a, <4 x float> %b + store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x float> %a, <8 x float> %b + store <8 x float> %select, <8 x float> 
addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v2f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x double> %a, <2 x double> %b + store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x double> %a, <4 x double> %b + store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x double> %a, <8 x double> %b + store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/select.ll b/llvm/test/CodeGen/AMDGPU/select.ll new file mode 100644 index 00000000000..45f3cd5a7ac --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; Normally icmp + select is optimized to select_cc; when this happens, the +; DAGLegalizer never sees the select and doesn't have a chance to legalize it. +; +; In order to avoid the select_cc optimization, this test case calculates the +; condition for the select in a separate basic block. 
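+; +; As a rough illustration (a sketch only, not part of the test): if the compare fed the select from the same block, e.g. +; %c = icmp eq i32 %cond, 0 +; %s = select i1 %c, i32 2, i32 3 +; the DAG combiner could fold the pair into a single select_cc node. Carrying +; %br_cmp in from another block through a phi blocks that fold, so the plain +; select survives for the legalizer to see.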
+ +; FUNC-LABEL: {{^}}select: +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, + <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, + <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, + i32 %cond) { +entry: + br label %for +body: + %inc = add i32 %i, 1 + %br_cmp.i = icmp eq i1 %br_cmp, 0 + br label %for +for: + %i = phi i32 [ %inc, %body], [ 0, %entry ] + %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] + %0 = icmp eq i32 %cond, %i + %1 = select i1 %br_cmp, i32 2, i32 3 + %2 = select i1 %br_cmp, float 2.0 , float 5.0 + %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5> + %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0> + %5 = select i1 %br_cmp, <4 x i32> <i32 2, i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9> + %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0> + br i1 %0, label %body, label %done + +done: + store i32 %1, i32 addrspace(1)* %i32out + store float %2, float addrspace(1)* %f32out + store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out + store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out + store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out + store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll new file mode 100644 index 00000000000..5cebb30dc72 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}select0: +; i64 select should be split into two i32 selects, and we shouldn't need +; to use a shift to extract the hi dword of the input. 
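+; A rough sketch of the lowering this expects (illustrative operand names only, assuming the compare result lands in vcc): +; v_cndmask_b32 v_lo, v_in_lo, 0, vcc ; low half of the select +; v_cndmask_b32 v_hi, v_in_hi, 0, vcc ; high half of the select +; i.e. one v_cndmask per 32-bit half, with no 64-bit shift of the input.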
+; CHECK-NOT: s_lshr_b64 +; CHECK: v_cndmask +; CHECK: v_cndmask +define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +entry: + %0 = icmp ugt i32 %cond, 5 + %1 = select i1 %0, i64 0, i64 %in + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 0, i64 %in + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_i64_split_imm: +; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 +; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 +; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] +; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK: s_endpgm +define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 + store i64 %sel, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll new file mode 100644 index 00000000000..94d0ace7569 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE +;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, +;CHECK: 1073741824 +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { + %1 = load float, float addrspace(1)* %in + %2 = fcmp oeq float %1, 0.0 + %3 = select i1 %2, float 1.0, float 2.0 + store float %3, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll new file mode 100644 index 00000000000..58a4ee7d62b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE_INT +;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, +;CHECK-NEXT: 2 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %1 = load i32, i32 addrspace(1)* %in + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i32 1, i32 2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll new file mode 100644 index 00000000000..e870ee891e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Note additional optimizations may cause this SGT to be replaced with a +; CND* instruction. +; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, +; CHECK-NEXT: -1 +; Test a selectcc with i32 LHS/RHS and float True/False + +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = icmp sge i32 %0, 0 + %2 = select i1 %1, float 1.0, float 0.0 + store float %2, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll new file mode 100644 index 00000000000..65be4a626a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}test_a: +; EG-NOT: CND +; EG: SET{{[NEQGTL]+}}_DX10 + +define void @test_a(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.000000e+00 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %IF, label %ENDIF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Same as test_a, but the branch labels are swapped to produce the inverse cc +; for the icmp instruction + +; EG-LABEL: {{^}}test_b: +; EG: SET{{[GTEQN]+}}_DX10 +; EG-NEXT: PRED_ +; EG-NEXT: ALU clause starting +define void @test_b(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %ENDIF, label %IF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Test a CND*_INT instruction with float true/false values +; EG-LABEL: {{^}}test_c: +; EG: CND{{[GTE]+}}_INT +define void @test_c(float addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + %1 = select i1 %0, float 2.0, float 3.0 + store float %1, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}selectcc_bool: +; SI: v_cmp_ne_i32 +; SI-NEXT: v_cndmask_b32_e64 +; SI-NOT: cmp +; SI-NOT: cndmask +define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = select i1 %icmp0, i32 -1, i32 0 + store i32 %ext, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll new file mode 100644 index 00000000000..f378e15dd76 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -march=r600 
-mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}selectcc_i64: +; EG: XOR_INT +; EG: XOR_INT +; EG: OR_INT +; EG: CNDE_INT +; EG: CNDE_INT +; SI: v_cmp_eq_i64 +; SI: v_cndmask +; SI: v_cndmask +define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +entry: + %0 = icmp eq i64 %lhs, %rhs + %1 = select i1 %0, i64 %true, i64 %false + store i64 %1, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/set-dx10.ll b/llvm/test/CodeGen/AMDGPU/set-dx10.ll new file mode 100644 index 00000000000..53694dcffa6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/set-dx10.ll @@ -0,0 +1,161 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests check that floating point comparisons which are used by select +; to store integer true (-1) and false (0) values are lowered to one of the +; SET*DX10 instructions. + +; CHECK: {{^}}fcmp_une_select_fptosi: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_une_select_i32: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_fptosi: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_i32: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float 
%in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oge_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oge_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll new file mode 100644 index 00000000000..11ea793650c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; EG-LABEL: {{^}}and_setcc_setcc_i32: +; EG: AND_INT +; EG-NEXT: SETE_INT +define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %cmp1 = icmp eq i32 %a, -1 + %cmp2 = icmp eq i32 %b, -1 + %and = and i1 %cmp1, %cmp2 + %ext = sext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}and_setcc_setcc_v4i32: +; EG: AND_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> + %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %and = and <4 x i1> %cmp1, %cmp2 + %ext = sext <4 x i1> %and to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll new file mode 100644 index 00000000000..4e6a10d6b78 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to false +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to true +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; 
GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: +; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_i8max: +; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, 255 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1: +; GCN: buffer_load_sbyte [[B:v[0-9]+]] +; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { + %b = load i8, i8 addrspace(1)* %b.ptr + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: +; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FIXME: This ends up doing a buffer_load_ubyte, and a compare to +; 255. 
Seems to be because of ordering problems when not allowing load widths to be reduced. +; Should do a buffer_load_sbyte and compare with -1 + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: +; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_neg1: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll new file mode 100644 index 00000000000..f33a82df5ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -0,0 +1,377 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}setcc_v2i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y + +define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = icmp eq <2 x i32> %a, %b + %sext = sext <2 x i1> %result to <2 x i32> + store <2 x i32> %sext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}setcc_v4i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = icmp eq <4 x i32> %a, %b + %sext = sext <4 x i1> %result to <4 x i32> + store <4 x i32> %sext, <4 x i32> addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; Float comparisons 
+;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f32_oeq: +; R600: SETE_DX10 +; SI: v_cmp_eq_f32 +define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oeq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ogt: +; R600: SETGT_DX10 +; SI: v_cmp_gt_f32 +define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ogt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_oge: +; R600: SETGE_DX10 +; SI: v_cmp_ge_f32 +define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_olt: +; R600: SETGT_DX10 +; SI: v_cmp_lt_f32 +define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp olt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ole: +; R600: SETGE_DX10 +; SI: v_cmp_le_f32 +define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ole float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_one: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_lg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp one float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ord: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT +; SI: v_cmp_o_f32 +define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ord float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ueq: +; R600-DAG: SETNE_DX10 +; R600-DAG: SETNE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_nlg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ueq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ugt: +; R600: SETGE +; R600: SETE_DX10 +; SI: v_cmp_nle_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ugt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uge: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_nlt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ult: +; R600: SETGE +; R600: SETE_DX10 + +; SI: v_cmp_nge_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ult float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}f32_ule: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_ngt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ule float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_une: +; R600: SETNE_DX10 +; SI: v_cmp_neq_f32 +define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp une float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uno: +; R600: SETNE_DX10 +; R600: SETNE_DX10 +; R600: OR_INT +; R600: SETNE_INT +; SI: v_cmp_u_f32 +define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uno float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 32-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i32_eq: +; R600: SETE_INT +; SI: v_cmp_eq_i32 +define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ne: +; R600: SETNE_INT +; SI: v_cmp_ne_i32 +define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ne i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ugt: +; R600: SETGT_UINT +; SI: v_cmp_gt_u32 +define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ugt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_uge: +; R600: SETGE_UINT +; SI: v_cmp_ge_u32 +define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp uge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ult: +; R600: SETGT_UINT +; SI: v_cmp_lt_u32 +define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ult i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ule: +; R600: SETGE_UINT +; SI: v_cmp_le_u32 +define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ule i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sgt: +; R600: SETGT_INT +; SI: v_cmp_gt_i32 +define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sgt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sge: +; R600: SETGE_INT +; SI: v_cmp_ge_i32 +define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_slt: +; R600: SETGT_INT +; SI: v_cmp_lt_i32 +define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp slt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sle: +; R600: SETGE_INT +; SI: v_cmp_le_i32 +define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sle i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FIXME: 
This does 4 compares +; FUNC-LABEL: {{^}}v3i32_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a + %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i32> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i32> + store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v3i8_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid + %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a + %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i8> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i8> + store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll new file mode 100644 index 00000000000..231be7aa3da --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll @@ -0,0 +1,259 @@ +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; XXX: Merge this into setcc, once R600 supports 64-bit operations + +;;;==========================================================================;;; +;; Double comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f64_oeq: +; SI: v_cmp_eq_f64 +define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oeq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ogt: +; SI: v_cmp_gt_f64 +define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ogt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_oge: +; SI: v_cmp_ge_f64 +define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_olt: +; SI: v_cmp_lt_f64 +define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp olt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ole: +; SI: 
v_cmp_le_f64 +define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ole double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_one: +; SI: v_cmp_lg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp one double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ord: +; SI: v_cmp_o_f64 +define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ord double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ueq: +; SI: v_cmp_nlg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ueq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ugt: + +; SI: v_cmp_nle_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ugt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uge: +; SI: v_cmp_nlt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ult: +; SI: v_cmp_nge_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ult double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ule: +; SI: v_cmp_ngt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ule double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_une: +; SI: v_cmp_neq_f64 +define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp une double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uno: +; SI: v_cmp_u_f64 +define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uno double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 64-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i64_eq: +; SI: v_cmp_eq_i64 +define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ne: +; SI: v_cmp_ne_i64 +define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ne i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ugt: +; SI: v_cmp_gt_u64 +define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ugt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_uge: +; SI: v_cmp_ge_u64 +define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp uge i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ult: +; SI: v_cmp_lt_u64 +define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ult i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ule: +; SI: v_cmp_le_u64 +define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ule i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sgt: +; SI: v_cmp_gt_i64 +define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sgt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sge: +; SI: v_cmp_ge_i64 +define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sge i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_slt: +; SI: v_cmp_lt_i64 +define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp slt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sle: +; SI: v_cmp_le_i64 +define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sle i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/seto.ll b/llvm/test/CodeGen/AMDGPU/seto.ll new file mode 100644 index 00000000000..9b5d6b5dbd6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/seto.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp oeq float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/setuo.ll b/llvm/test/CodeGen/AMDGPU/setuo.ll new file mode 100644 index 00000000000..76346c4f624 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setuo.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp une float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll new file mode 100644 index 00000000000..7dc6eb87f6b --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: SUB_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = add i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = sub i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll new file mode 100644 index 00000000000..5aedda2ce1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -0,0 +1,611 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 +; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] +; SI: buffer_store_dword [[EXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { + %shl = shl i32 %in, 31 + %sext = ashr i32 %shl, 31 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 16 + %ashr = ashr i32 %shl, 16 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; 
SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %c = add <1 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <1 x i32> %c, <i32 24> + %ashr = ashr <1 x i32> %shl, <i32 24> + store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64: +; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000 +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64: +; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000 +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] +; EG: LSHL +; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal +; EG: ASHR [[RES_HI]] +; EG-NOT: BFE_INT +; EG: LSHR +; EG: LSHR +;; TODO Check address computation, using | with variables in {{}} does not work, +;; also the _LO/_HI order might be different +define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 56 + %ashr = ashr i64 %shl, 56 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64: +; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000 +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] +; EG: LSHL +; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal +; EG: ASHR [[RES_HI]] +; EG-NOT: BFE_INT +; EG: LSHR +; EG: LSHR +;; TODO Check address computation, using | with variables in {{}} does not work, +;; also the _LO/_HI order might be different +define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 48 + %ashr = ashr i64 %shl, 48 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64: +; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000 +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] +; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]],
[[ADDR_HI:T[0-9]+.[XYZW]]] +; EG-NOT: BFE_INT + +; EG: ASHR [[RES_HI]] + +; EG: LSHR +; EG: LSHR +;; TODO Check address computation, using | with variables in {{}} does not work, +;; also the _LO/_HI order might be different +define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. +; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: +; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 +; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 +; XSI: buffer_store_dword +; XEG: BFE_INT +; XEG: ASHR +; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { +;   %c = add <1 x i64> %a, %b +;   %shl = shl <1 x i64> %c, <i64 56> +;   %ashr = ashr <1 x i64> %shl, <i64 56> +;   store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 +;   ret void +; } + +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 56 + %ashr = ashr i64 %shl, 56 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep,
align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 48 + %ashr = ashr i64 %shl, 48 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}} +define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount: +; SI-NOT: s_lshl +; SI-NOT: s_ashr +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG-NOT: BFE +; EG: ADD_INT +; EG: LSHL +; EG: ASHR [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b + %x = shl i32 %c, 6 + %y = ashr i32 %x, 7 + store i32 %y, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount: +; SI-NOT: s_lshl +; SI-NOT: s_ashr +; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 +; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 +; SI: s_endpgm + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG-NOT: BFE +; EG: ADD_INT +; EG: LSHL +; EG: ASHR [[RES]] +; EG: LSHL +; EG: ASHR [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %c = add <2 x i32> %a, %b + %x = shl <2 x i32> %c, <i32 6, i32 6> + %y = ashr <2 x i32> %x, <i32 7, i32 7> + store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2 + ret void +} + + +; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32: +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: buffer_store_dwordx2 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %c = add <2 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i32> %c, <i32 31, i32 31> + %ashr = ashr <2 x i32> %shl, <i32 31, i32 31> + store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32: +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 +; SI: buffer_store_dwordx4 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32>
%a, <4 x i32> %b) nounwind { + %c = add <4 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31> + %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32: +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: buffer_store_dwordx2 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %c = add <2 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i32> %c, <i32 24, i32 24> + %ashr = ashr <2 x i32> %shl, <i32 24, i32 24> + store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32: +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} +; SI: buffer_store_dwordx4 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %c = add <4 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24> + %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32: +; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} +; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} +; SI: buffer_store_dwordx2 + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]] +; EG: BFE_INT [[RES]] +; EG: LSHR {{\*?}} [[ADDR]] +define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %c = add <2 x i32> %a, %b ; add to prevent folding into extload + %shl = shl <2 x i32> %c, <i32 16, i32 16> + %ashr = ashr <2 x i32> %shl, <i32 16, i32 16> + store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}testcase: +define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind { + %and_a_1 = and i8 %a, 1 + %cmp_eq = icmp eq i8 %and_a_1, 0 + %cmp_slt = icmp slt i8 %a, 0 + %sel0 = select i1 %cmp_slt, i8 0, i8 %a + %sel1 = select i1 %cmp_eq, i8 0, i8 %a + %xor = xor i8 %sel0, %sel1 + store i8 %xor, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}testcase_3: +define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind { + %and_a_1 = and i8 %a, 1 + %cmp_eq = icmp eq i8 %and_a_1, 0 + %cmp_slt = icmp slt i8 %a, 0 + %sel0 = select i1 %cmp_slt, i8 0, i8 %a + %sel1 = select i1 %cmp_eq, i8 0, i8 %a + %xor = xor i8 %sel0, %sel1 + store i8 %xor, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32: +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 +define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { + %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 + %loadb = load <4 x i32>, <4 x i32>
addrspace(1)* %b, align 16 + %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload + %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24> + %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 +; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 +define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { + %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 + %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 + %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload + %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16> + %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16> + store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type: +; SI: buffer_load_sbyte +; SI: v_max_i32 +; SI-NOT: bfe +; SI: buffer_store_short +define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = sext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = sext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_0_width: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_8_bfe_8: +; SI: v_bfe_i32 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_8_bfe_16: +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; SI: s_endpgm +define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; This really should be folded into a single BFE +; FUNC-LABEL: {{^}}bfe_16_bfe_8: +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { + %load = load i32, i32 addrspace(1)* %ptr, align 4 + %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone + %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone + store i32 %bfe1, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure there isn't a redundant BFE +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: +; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out,
i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: +define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: +; SI: buffer_load_sbyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI: .text +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: +; SI-NOT: shr +; SI-NOT: shl +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: shl +; SI-NOT: shr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 +; SI: s_endpgm +define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll new file mode 100644 index 00000000000..38289ced632 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; +; +; Most SALU instructions 
ignore control flow, so we need to make sure +; they don't overwrite values from other blocks. + +; If the branch decision is made based on a value in an SGPR then all +; threads will execute the same code paths, so we don't need to worry +; about instructions in different blocks overwriting each other. +; SI-LABEL: {{^}}sgpr_if_else_salu_br: +; SI: s_add +; SI: s_add + +define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = add i32 %b, %c + br label %endif + +else: + %2 = add i32 %d, %e + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + %4 = add i32 %3, %a + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; The two S_ADD instructions should write to different registers, since +; different threads will take different control flow paths. + +; SI-LABEL: {{^}}sgpr_if_else_valu_br: +; SI: s_add_i32 [[SGPR:s[0-9]+]] +; SI-NOT: s_add_i32 [[SGPR]] + +define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tid_f = uitofp i32 %tid to float + %tmp1 = fcmp ueq float %tid_f, 0.0 + br i1 %tmp1, label %if, label %else + +if: + %tmp2 = add i32 %b, %c + br label %endif + +else: + %tmp3 = add i32 %d, %e + br label %endif + +endif: + %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should write to different SGPR pairs instead of copying to +; VALU for i1 phi. + +; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] + +; SI: BB2_1: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] + +; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] +; SI: buffer_store_dword [[RESULT]] +define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = icmp eq i32 %tid, 0 + br i1 %tmp1, label %if, label %else + +if: + %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid + %a.val = load i32, i32 addrspace(1)* %gep.if + %cmp.if = icmp eq i32 %a.val, 0 + br label %endif + +else: + %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid + %b.val = load i32, i32 addrspace(1)* %gep.else + %cmp.else = icmp slt i32 %b.val, 0 + br label %endif + +endif: + %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] + %ext = sext i1 %tmp4 to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll new file mode 100644 index 00000000000..df67fcca22f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Copy VGPR -> SGPR used twice as an instruction operand, which is then +; used in a REG_SEQUENCE that also needs to be handled.
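+; In the test below, the low element of the loaded vector feeds both +; operands of the add, and the result is reinserted next to the high +; element, which is where the REG_SEQUENCE comes from.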
+ +; SI-LABEL: {{^}}test_dup_operands: +; SI: v_add_i32_e32 +define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %lo = extractelement <2 x i32> %a, i32 0 + %hi = extractelement <2 x i32> %a, i32 1 + %add = add i32 %lo, %lo + %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 + %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 + store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll new file mode 100644 index 00000000000..b849c4038bc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -0,0 +1,379 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; This test checks that no VGPR to SGPR copies are created by the register +; allocator. +; CHECK-LABEL: {{^}}phi1: +; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 +; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] + +define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %25 = fptosi float %23 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %26, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %27 = fsub float -0.000000e+00, %22 + br label %ENDIF + +ENDIF: ; preds = %main_body, %ELSE + %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] + %28 = fadd float %temp.0, %24 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) + ret void +} + +; Make sure this program doesn't crash +; CHECK-LABEL: {{^}}phi2: +define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) + %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) + %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) + %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) + %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) + %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) + %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) + %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) + %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) + %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) + %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) + %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) + %36 = call float 
@llvm.SI.load.const(<16 x i8> %21, i32 92) + %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 + %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 + %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 + %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) + %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) + %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) + %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) + %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) + %46 = bitcast float %41 to i32 + %47 = bitcast float %42 to i32 + %48 = insertelement <2 x i32> undef, i32 %46, i32 0 + %49 = insertelement <2 x i32> %48, i32 %47, i32 1 + %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) + %51 = extractelement <4 x float> %50, i32 2 + %52 = call float @fabs(float %51) + %53 = fmul float %43, %43 + %54 = fmul float %44, %44 + %55 = fadd float %54, %53 + %56 = fmul float %45, %45 + %57 = fadd float %55, %56 + %58 = call float @llvm.AMDGPU.rsq.f32(float %57) + %59 = fmul float %43, %58 + %60 = fmul float %44, %58 + %61 = fmul float %45, %58 + %62 = fmul float %59, %23 + %63 = fmul float %60, %24 + %64 = fadd float %63, %62 + %65 = fmul float %61, %25 + %66 = fadd float %64, %65 + %67 = fsub float -0.000000e+00, %26 + %68 = fmul float %66, %52 + %69 = fadd float %68, %67 + %70 = fmul float %27, %69 + %71 = fmul float %28, %69 + %72 = call float @fabs(float %70) + %73 = fcmp olt float 0x3EE4F8B580000000, %72 + %74 = sext i1 %73 to i32 + %75 = bitcast i32 %74 to float + %76 = bitcast float %75 to i32 + %77 = icmp ne i32 %76, 0 + br i1 %77, label %IF, label %ENDIF + +IF: ; preds = %main_body + %78 = fsub float -0.000000e+00, %70 + %79 = call float @llvm.AMDIL.exp.(float %78) + %80 = fsub float -0.000000e+00, %79 + %81 = fadd float 1.000000e+00, %80 + %82 = fdiv float 1.000000e+00, %70 + %83 = fmul float %81, %82 + %84 = fmul float %32, %83 + br label %ENDIF + +ENDIF: ; preds = %main_body, %IF + %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] + %85 = call float @fabs(float %71) + %86 = fcmp olt float 0x3EE4F8B580000000, %85 + %87 = sext i1 %86 to i32 + %88 = bitcast i32 %87 to float + %89 = bitcast float %88 to i32 + %90 = icmp ne i32 %89, 0 + br i1 %90, label %IF25, label %ENDIF24 + +IF25: ; preds = %ENDIF + %91 = fsub float -0.000000e+00, %71 + %92 = call float @llvm.AMDIL.exp.(float %91) + %93 = fsub float -0.000000e+00, %92 + %94 = fadd float 1.000000e+00, %93 + %95 = fdiv float 1.000000e+00, %71 + %96 = fmul float %94, %95 + %97 = fmul float %36, %96 + br label %ENDIF24 + +ENDIF24: ; preds = %ENDIF, %IF25 + %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] + %98 = fmul float %29, %temp4.0 + %99 = fmul float %30, %temp4.0 + %100 = fmul float %31, %temp4.0 + %101 = fmul float %33, %temp8.0 + %102 = fadd float %101, %98 + %103 = fmul float %34, %temp8.0 + %104 = fadd float %103, %99 + %105 = fmul float %35, %temp8.0 + %106 = fadd float %105, %100 + %107 = call float @llvm.pow.f32(float %52, float %22) + %108 = fsub float -0.000000e+00, %102 + %109 = fmul float %108, %107 + %110 = fsub float -0.000000e+00, %104 + %111 = fmul float %110, %107 + %112 = fsub float -0.000000e+00, %106 + %113 = fmul float %112, %107 + %114 = call i32 @llvm.SI.packf16(float %109, float %111) + %115 = bitcast i32 %114 to float + %116 = call i32 @llvm.SI.packf16(float 
%113, float 1.000000e+00) + %117 = bitcast i32 %116 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117) + ret void +} + +; We just want to make sure the program doesn't crash +; CHECK-LABEL: {{^}}loop: + +define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8) + %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12) + %26 = fptosi float %25 to i32 + %27 = bitcast i32 %26 to float + %28 = bitcast float %27 to i32 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ] + %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ] + %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ] + %29 = bitcast float %temp8.0 to i32 + %30 = icmp sge i32 %29, %28 + %31 = sext i1 %30 to i32 + %32 = bitcast i32 %31 to float + %33 = bitcast float %32 to i32 + %34 = icmp ne i32 %33, 0 + br i1 %34, label %IF, label %ENDIF + +IF: ; preds = %LOOP + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) + ret void + +ENDIF: ; preds = %LOOP + %35 = bitcast float %temp8.0 to i32 + %36 = add i32 %35, 1 + %37 = bitcast i32 %36 to float + br label %LOOP +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readonly } +attributes #3 = { readnone } +attributes #4 = { nounwind readonly } + +!0 = !{!"const", null} +!1 = !{!0, !0, i64 0, i32 1} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #3 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #3 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +; This checks for a bug in the FixSGPRCopies pass where VReg96 +; registers were being identified as an SGPR regclass, which was causing +; an assertion failure.
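+; (VReg96 is the 96-bit register class; here it presumably arises from +; the <3 x i32> argument of sample_v3 below.)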
+ +; CHECK-LABEL: {{^}}sample_v3: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: exp +; CHECK: s_endpgm +define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { + +entry: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) + %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 + %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 + %28 = fcmp oeq float %23, 0.0 + br i1 %28, label %if, label %else + +if: + %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.if.0 = extractelement <4 x float> %val.if, i32 0 + %val.if.1 = extractelement <4 x float> %val.if, i32 1 + %val.if.2 = extractelement <4 x float> %val.if, i32 2 + br label %endif + +else: + %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.else.0 = extractelement <4 x float> %val.else, i32 0 + %val.else.1 = extractelement <4 x float> %val.else, i32 1 + %val.else.2 = extractelement <4 x float> %val.else, i32 2 + br label %endif + +endif: + %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] + %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] + %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) + ret void +} + +!2 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}copy1: +; CHECK: buffer_load_dword +; CHECK: v_add +; CHECK: s_endpgm +define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +entry: + %0 = load float, float addrspace(1)* %in0 + %1 = fcmp oeq float %0, 0.0 + br i1 %1, label %if0, label %endif + +if0: + %2 = bitcast float %0 to i32 + %3 = fcmp olt float %0, 0.0 + br i1 %3, label %if1, label %endif + +if1: + %4 = add i32 %2, 1 + br label %endif + +endif: + %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] + %6 = bitcast i32 %5 to float + store float %6, float addrspace(1)* %out + ret void +} + +; This test is just checking that we don't crash / assertion fail. 
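+; copy2 carries an i32 counter and a float accumulator through loop phis; +; the only expectation is that compilation completes through to s_endpgm.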
+; CHECK-LABEL: {{^}}copy2: +; CHECK: s_endpgm + +define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +entry: + br label %LOOP68 + +LOOP68: + %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] + %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] + %g = icmp eq i32 0, %t + %l = bitcast float %temp4.7 to i32 + br i1 %g, label %IF70, label %ENDIF69 + +IF70: + %q = icmp ne i32 %l, 13 + %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + ret void + +ENDIF69: + %u = add i32 %l, %t + %v = bitcast i32 %u to float + %x = add i32 %t, -1 + br label %LOOP68 +} + +attributes #0 = { "ShaderType"="0" } + +; This test checks that image_sample resource descriptors aren't loaded into +; vgprs. The verifier will fail if this happens. +; CHECK-LABEL:{{^}}sample_rsrc: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: s_endpgm +define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +bb: + %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 + %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) + %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 + %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 + %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 + %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 + %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp31 = bitcast float %tmp23 to i32 + %tmp36 = icmp ne i32 %tmp31, 0 + br i1 %tmp36, label %bb38, label %bb80 + +bb38: ; preds = %bb + %tmp52 = bitcast float %tmp29 to i32 + %tmp53 = bitcast float %tmp30 to i32 + %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 + %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 + %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) + br label %bb71 + +bb80: ; preds = %bb + %tmp81 = bitcast float %tmp29 to i32 + %tmp82 = bitcast float %tmp30 to i32 + %tmp82.2 = add i32 %tmp82, 1 + %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 + %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 + %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) + br label %bb71 + +bb71: ; preds = %bb80, %bb38 + %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] + %tmp88 = extractelement <4 x float> %tmp72, i32 0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) + ret void +} + +attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/shared-op-cycle.ll b/llvm/test/CodeGen/AMDGPU/shared-op-cycle.ll new file mode 100644 index 00000000000..f52a9baf4d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shared-op-cycle.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK: MULADD_IEEE * +; CHECK-NOT: MULADD_IEEE * + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { + %w0 = extractelement <4 x float> %reg0, i32 3 + %w1 = extractelement <4 x float> %reg1, i32 3 + %w2 = extractelement <4 x float> %reg2, i32 3 + %sq0 = fmul float %w0, %w0 + %r0 = fadd float %sq0, 2.0 + %sq1 = fmul float %w1, %w1 + %r1 = fadd float %sq1, 2.0 + %sq2 = fmul float %w2, %w2 + %r2 = fadd float %sq2, 2.0 + %v0 = insertelement <4 x float> undef, float %r0, i32 0 + %v1 = insertelement <4 x float> %v0, float %r1, i32 1 + %v2 = insertelement <4 x float> %v1, float %r2, i32 2 + %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) + %vecres = insertelement <4 x float> undef, float %res, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll new file mode 100644 index 00000000000..53b63dc4b8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -0,0 +1,180 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG: {{^}}shl_v2i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v2i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v2i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = shl <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v4i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v4i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = shl <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_i64: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 + +;SI: {{^}}shl_i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = shl i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v2i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v2i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = shl <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v4i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v4i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = shl <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll new file mode 100644 index 00000000000..b1485bfaaeb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; Test with inline immediate + +; FUNC-LABEL: {{^}}shl_2_add_9_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: +; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} +; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} +; SI-DAG: buffer_store_dword [[ADDREG]] +; SI-DAG: buffer_store_dword [[SHLREG]] +; SI: s_endpgm +define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out0, align 4 + store i32 %add, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test with add literal constant + +; FUNC-LABEL: {{^}}shl_2_add_999_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %shl = add i32 %val, 999 + %result = shl i32 %shl, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; 
FUNC-LABEL: {{^}}test_add_shl_add_constant: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] +define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %shl, %y + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] + +define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %y, %shl + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll new file mode 100644 index 00000000000..6671e909cd1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -0,0 +1,284 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +; Test that doing a shift of a pointer with a constant add will be +; folded into the constant offset addressing mode even if the add has +; multiple uses. This is relevant to accessing 2 separate, adjacent +; LDS globals. + + +declare i32 @llvm.r600.read.tidig.x() #1 + +@lds0 = addrspace(3) global [512 x float] undef, align 4 +@lds1 = addrspace(3) global [512 x float] undef, align 4 + + +; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 + +; SI-LABEL: {{^}}load_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 +; SI: s_endpgm +define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +; Make sure once the first use is folded into the addressing mode, the +; remaining add use goes through the normal shl + add constant fold. 
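+; As in load_shl_base_lds_0, (tid + 2) << 2 still becomes (tid << 2) + 8 for +; the ds_read, while the leftover use of the add is checked as a plain +; v_add_i32 of 8.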
+ +; SI-LABEL: {{^}}load_shl_base_lds_1: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 +; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} +; SI-DAG: buffer_store_dword [[RESULT]] +; SI-DAG: buffer_store_dword [[ADDUSE]] +; SI: s_endpgm +define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %shl_add_use = shl i32 %idx.0, 2 + store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +@maxlds = addrspace(3) global [65536 x i8] undef, align 4 + +; SI-LABEL: {{^}}load_shl_base_lds_max_offset +; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 +; SI: s_endpgm +define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 65535 + %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 + %val0 = load i8, i8 addrspace(3)* %arrayidx0 + store i32 %idx.0, i32 addrspace(1)* %add_use + store i8 %val0, i8 addrspace(1)* %out + ret void +} + +; The two globals are placed adjacent in memory, so the same base +; pointer can be used with an offset into the second one. + +; SI-LABEL: {{^}}load_shl_base_lds_2: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 +; SI: s_endpgm +define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + store float %sum, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}store_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + store float 1.0, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + + +; -------------------------------------------------------------------------------- +; Atomics. 
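+; Each DS atomic below should show the same folding: the (add tid, 2) << 2 +; pointer becomes [[PTR]] plus an offset:8 on the returning (_rtn_) form of +; the instruction.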
+ +@lds2 = addrspace(3) global [512 x i32] undef, align 4 + +; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + + +; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + +; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+ %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+ %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+ store i32 %val, i32 addrspace(1)* %out, align 4
+ store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
new file mode 100644
index 00000000000..69d719385ac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll
@@ -0,0 +1,25 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -asm-verbose=false < %s | FileCheck %s
+
+
+define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+; CHECK-LABEL: {{^}}test:
+
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 60, label %sw.bb
+ ]
+
+sw.bb:
+ unreachable
+
+sw.default:
+ unreachable
+
+sw.epilog:
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
new file mode 100644
index 00000000000..bbcb861f37d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
+
+; SI: [[LOOP_LABEL:[A-Z0-9]+]]:
+; Lowered break instruction:
+; SI: s_or_b64
+; Lowered Loop instruction:
+; SI: s_andn2_b64
+; SI: s_cbranch_execnz [[LOOP_LABEL]]
+; SI: s_endpgm
+define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+ %0 = and i32 %a, %b
+ %1 = trunc i32 %0 to i1
+ br label %ENDIF
+
+ENDLOOP:
+ store i32 0, i32 addrspace(1)* %out
+ ret void
+
+ENDIF:
+ br i1 %1, label %ENDLOOP, label %ENDIF
+}
+
+
+; FUNC-LABEL: {{^}}phi_cond_outside_loop:
+; FIXME: This could be folded into the s_or_b64 instruction
+; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
+; SI: [[LOOP_LABEL:[A-Z0-9]+]]
+; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
+
+; SI_IF_BREAK instruction:
+; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+
+; SI_LOOP instruction:
+; SI: s_andn2_b64 exec, exec, [[BREAK]]
+; SI: s_cbranch_execnz
[[LOOP_LABEL]]
+; SI: s_endpgm
+
+define void @phi_cond_outside_loop(i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ br label %endif
+
+else:
+ %1 = icmp eq i32 %b, 0
+ br label %endif
+
+endif:
+ %2 = phi i1 [0, %if], [%1, %else]
+ br label %loop
+
+loop:
+ br i1 %2, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll
new file mode 100644
index 00000000000..944499a1146
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -0,0 +1,52 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; This shader has the potential to generate illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
+
+; CHECK: {{^}}main:
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0
+ %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1
+ %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0
+ %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1
+ %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+ %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+ %29 = bitcast float %22 to i32
+ %30 = bitcast float %27 to i32
+ %31 = bitcast float %28 to i32
+ %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+ %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+ %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+ %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+ %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+ %37 = extractelement <4 x float> %36, i32 0
+ %38 = extractelement <4 x float> %36, i32 1
+ %39 = extractelement <4 x float> %36, i32 2
+ %40 = extractelement <4 x float> %36, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
new file mode 100644
index 00000000000..84652701f77
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -0,0 +1,1568 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; These tests check that the compiler won't crash when it
needs to spill +; SGPRs. + +; CHECK-LABEL: {{^}}main: +; CHECK: s_wqm +; Writing to M0 from an SMRD instruction will hang the GPU. +; CHECK-NOT: s_buffer_load_dword m0 +; CHECK: s_endpgm +@ddxy_lds = external addrspace(3) global [64 x i32] + +define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) + %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 + %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 + %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 + %67 
= getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 + %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 + %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 + %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 + %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 + %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 + %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 + %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 + %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 + %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 + %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 + %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 + %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 + %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 + %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %110 = call i32 @llvm.SI.tid() + %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 + %112 = bitcast float %93 to i32 + store i32 %112, i32 addrspace(3)* %111 + %113 = bitcast float %94 to i32 + store i32 %113, i32 addrspace(3)* %111 + %114 = call i32 @llvm.SI.tid() + %115 = getelementptr [64 x i32], [64 
x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114 + %116 = and i32 %114, -4 + %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 + %118 = add i32 %116, 1 + %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 + %120 = bitcast float %93 to i32 + store i32 %120, i32 addrspace(3)* %115 + %121 = load i32, i32 addrspace(3)* %117 + %122 = bitcast i32 %121 to float + %123 = load i32, i32 addrspace(3)* %119 + %124 = bitcast i32 %123 to float + %125 = fsub float %124, %122 + %126 = bitcast float %94 to i32 + store i32 %126, i32 addrspace(3)* %115 + %127 = load i32, i32 addrspace(3)* %117 + %128 = bitcast i32 %127 to float + %129 = load i32, i32 addrspace(3)* %119 + %130 = bitcast i32 %129 to float + %131 = fsub float %130, %128 + %132 = insertelement <4 x float> undef, float %125, i32 0 + %133 = insertelement <4 x float> %132, float %131, i32 1 + %134 = insertelement <4 x float> %133, float %131, i32 2 + %135 = insertelement <4 x float> %134, float %131, i32 3 + %136 = extractelement <4 x float> %135, i32 0 + %137 = extractelement <4 x float> %135, i32 1 + %138 = fmul float %60, %93 + %139 = fmul float %60, %94 + %140 = fmul float %60, %94 + %141 = fmul float %60, %94 + %142 = call i32 @llvm.SI.tid() + %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 + %144 = bitcast float %138 to i32 + store i32 %144, i32 addrspace(3)* %143 + %145 = bitcast float %139 to i32 + store i32 %145, i32 addrspace(3)* %143 + %146 = bitcast float %140 to i32 + store i32 %146, i32 addrspace(3)* %143 + %147 = bitcast float %141 to i32 + store i32 %147, i32 addrspace(3)* %143 + %148 = call i32 @llvm.SI.tid() + %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 + %150 = and i32 %148, -4 + %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 + %152 = add i32 %150, 2 + %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 + %154 = bitcast float %138 to i32 + store i32 %154, i32 addrspace(3)* %149 + %155 = load i32, i32 addrspace(3)* %151 + %156 = bitcast i32 %155 to float + %157 = load i32, i32 addrspace(3)* %153 + %158 = bitcast i32 %157 to float + %159 = fsub float %158, %156 + %160 = bitcast float %139 to i32 + store i32 %160, i32 addrspace(3)* %149 + %161 = load i32, i32 addrspace(3)* %151 + %162 = bitcast i32 %161 to float + %163 = load i32, i32 addrspace(3)* %153 + %164 = bitcast i32 %163 to float + %165 = fsub float %164, %162 + %166 = bitcast float %140 to i32 + store i32 %166, i32 addrspace(3)* %149 + %167 = load i32, i32 addrspace(3)* %151 + %168 = bitcast i32 %167 to float + %169 = load i32, i32 addrspace(3)* %153 + %170 = bitcast i32 %169 to float + %171 = fsub float %170, %168 + %172 = bitcast float %141 to i32 + store i32 %172, i32 addrspace(3)* %149 + %173 = load i32, i32 addrspace(3)* %151 + %174 = bitcast i32 %173 to float + %175 = load i32, i32 addrspace(3)* %153 + %176 = bitcast i32 %175 to float + %177 = fsub float %176, %174 + %178 = insertelement <4 x float> undef, float %159, i32 0 + %179 = insertelement <4 x float> %178, float %165, i32 1 + %180 = insertelement <4 x float> %179, float %171, i32 2 + %181 = insertelement <4 x float> %180, float %177, i32 3 + %182 = extractelement <4 x float> %181, i32 0 + %183 = extractelement <4 x float> %181, i32 1 + %184 = fdiv float 1.000000e+00, %97 + %185 = fmul float %33, %184 + %186 = fcmp uge float 1.000000e+00, %185 + %187 = select i1 %186, float %185, float 1.000000e+00 
+ %188 = fmul float %187, %30 + %189 = call float @ceil(float %188) + %190 = fcmp uge float 3.000000e+00, %189 + %191 = select i1 %190, float 3.000000e+00, float %189 + %192 = fdiv float 1.000000e+00, %191 + %193 = fdiv float 1.000000e+00, %30 + %194 = fmul float %191, %193 + %195 = fmul float %31, %194 + %196 = fmul float %95, %95 + %197 = fmul float %96, %96 + %198 = fadd float %197, %196 + %199 = fmul float %97, %97 + %200 = fadd float %198, %199 + %201 = call float @llvm.AMDGPU.rsq.f32(float %200) + %202 = fmul float %95, %201 + %203 = fmul float %96, %201 + %204 = fmul float %202, %29 + %205 = fmul float %203, %29 + %206 = fmul float %204, -1.000000e+00 + %207 = fmul float %205, 1.000000e+00 + %208 = fmul float %206, %32 + %209 = fmul float %207, %32 + %210 = fsub float -0.000000e+00, %208 + %211 = fadd float %93, %210 + %212 = fsub float -0.000000e+00, %209 + %213 = fadd float %94, %212 + %214 = fmul float %206, %192 + %215 = fmul float %207, %192 + %216 = fmul float -1.000000e+00, %192 + %217 = bitcast float %136 to i32 + %218 = bitcast float %182 to i32 + %219 = bitcast float %137 to i32 + %220 = bitcast float %183 to i32 + %221 = insertelement <8 x i32> undef, i32 %217, i32 0 + %222 = insertelement <8 x i32> %221, i32 %218, i32 1 + %223 = insertelement <8 x i32> %222, i32 %219, i32 2 + %224 = insertelement <8 x i32> %223, i32 %220, i32 3 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] + %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] + %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] + %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] + %225 = fcmp oge float %temp24.0, %191 + %226 = sext i1 %225 to i32 + %227 = bitcast i32 %226 to float + %228 = bitcast float %227 to i32 + %229 = icmp ne i32 %228, 0 + br i1 %229, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %230 = bitcast float %136 to i32 + %231 = bitcast float %182 to i32 + %232 = bitcast float %137 to i32 + %233 = bitcast float %183 to i32 + %234 = insertelement <8 x i32> undef, i32 %230, i32 0 + %235 = insertelement <8 x i32> %234, i32 %231, i32 1 + %236 = insertelement <8 x i32> %235, i32 %232, i32 2 + %237 = insertelement <8 x i32> %236, i32 %233, i32 3 + br label %LOOP65 + +ENDIF: ; preds = %LOOP + %238 = bitcast float %temp28.0 to i32 + %239 = bitcast float %temp29.0 to i32 + %240 = insertelement <8 x i32> %224, i32 %238, i32 4 + %241 = insertelement <8 x i32> %240, i32 %239, i32 5 + %242 = insertelement <8 x i32> %241, i32 undef, i32 6 + %243 = insertelement <8 x i32> %242, i32 undef, i32 7 + %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) + %245 = extractelement <4 x float> %244, i32 3 + %246 = fcmp oge float %temp30.0, %245 + %247 = sext i1 %246 to i32 + %248 = bitcast i32 %247 to float + %249 = bitcast float %248 to i32 + %250 = and i32 %249, 1065353216 + %251 = bitcast i32 %250 to float + %252 = fmul float %214, %251 + %253 = fadd float %252, %temp28.0 + %254 = fmul float %215, %251 + %255 = fadd float %254, %temp29.0 + %256 = fmul float %216, %251 + %257 = fadd float %256, %temp30.0 + %258 = fadd float %temp24.0, 1.000000e+00 + br label %LOOP + +LOOP65: ; preds = %ENDIF66, %IF + %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] + %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] + %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] + %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] + %temp32.0 = 
phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ] + %259 = fcmp oge float %temp24.1, %195 + %260 = sext i1 %259 to i32 + %261 = bitcast i32 %260 to float + %262 = bitcast float %261 to i32 + %263 = icmp ne i32 %262, 0 + br i1 %263, label %IF67, label %ENDIF66 + +IF67: ; preds = %LOOP65 + %264 = bitcast float %136 to i32 + %265 = bitcast float %182 to i32 + %266 = bitcast float %137 to i32 + %267 = bitcast float %183 to i32 + %268 = bitcast float %temp28.1 to i32 + %269 = bitcast float %temp29.1 to i32 + %270 = insertelement <8 x i32> undef, i32 %264, i32 0 + %271 = insertelement <8 x i32> %270, i32 %265, i32 1 + %272 = insertelement <8 x i32> %271, i32 %266, i32 2 + %273 = insertelement <8 x i32> %272, i32 %267, i32 3 + %274 = insertelement <8 x i32> %273, i32 %268, i32 4 + %275 = insertelement <8 x i32> %274, i32 %269, i32 5 + %276 = insertelement <8 x i32> %275, i32 undef, i32 6 + %277 = insertelement <8 x i32> %276, i32 undef, i32 7 + %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) + %279 = extractelement <4 x float> %278, i32 0 + %280 = extractelement <4 x float> %278, i32 1 + %281 = extractelement <4 x float> %278, i32 2 + %282 = extractelement <4 x float> %278, i32 3 + %283 = fmul float %282, %47 + %284 = bitcast float %136 to i32 + %285 = bitcast float %182 to i32 + %286 = bitcast float %137 to i32 + %287 = bitcast float %183 to i32 + %288 = bitcast float %temp28.1 to i32 + %289 = bitcast float %temp29.1 to i32 + %290 = insertelement <8 x i32> undef, i32 %284, i32 0 + %291 = insertelement <8 x i32> %290, i32 %285, i32 1 + %292 = insertelement <8 x i32> %291, i32 %286, i32 2 + %293 = insertelement <8 x i32> %292, i32 %287, i32 3 + %294 = insertelement <8 x i32> %293, i32 %288, i32 4 + %295 = insertelement <8 x i32> %294, i32 %289, i32 5 + %296 = insertelement <8 x i32> %295, i32 undef, i32 6 + %297 = insertelement <8 x i32> %296, i32 undef, i32 7 + %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) + %299 = extractelement <4 x float> %298, i32 0 + %300 = extractelement <4 x float> %298, i32 1 + %301 = extractelement <4 x float> %298, i32 2 + %302 = bitcast float %136 to i32 + %303 = bitcast float %182 to i32 + %304 = bitcast float %137 to i32 + %305 = bitcast float %183 to i32 + %306 = bitcast float %temp28.1 to i32 + %307 = bitcast float %temp29.1 to i32 + %308 = insertelement <8 x i32> undef, i32 %302, i32 0 + %309 = insertelement <8 x i32> %308, i32 %303, i32 1 + %310 = insertelement <8 x i32> %309, i32 %304, i32 2 + %311 = insertelement <8 x i32> %310, i32 %305, i32 3 + %312 = insertelement <8 x i32> %311, i32 %306, i32 4 + %313 = insertelement <8 x i32> %312, i32 %307, i32 5 + %314 = insertelement <8 x i32> %313, i32 undef, i32 6 + %315 = insertelement <8 x i32> %314, i32 undef, i32 7 + %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) + %317 = extractelement <4 x float> %316, i32 0 + %318 = extractelement <4 x float> %316, i32 1 + %319 = extractelement <4 x float> %316, i32 2 + %320 = fmul float %317, %23 + %321 = fmul float %318, %24 + %322 = fmul float %319, %25 + %323 = fmul float %299, %26 + %324 = fadd float %323, %320 + %325 = fmul float %300, %27 + %326 = fadd float %325, %321 + %327 = fmul float %301, %28 + %328 = fadd float %327, %322 + %329 = fadd float %279, %324 + %330 = fadd float %280, %326 + %331 = fadd float %281, %328 + %332 = bitcast float %136 to i32 + %333 = bitcast float %182 to i32 + %334 = bitcast float 
%137 to i32 + %335 = bitcast float %183 to i32 + %336 = bitcast float %temp28.1 to i32 + %337 = bitcast float %temp29.1 to i32 + %338 = insertelement <8 x i32> undef, i32 %332, i32 0 + %339 = insertelement <8 x i32> %338, i32 %333, i32 1 + %340 = insertelement <8 x i32> %339, i32 %334, i32 2 + %341 = insertelement <8 x i32> %340, i32 %335, i32 3 + %342 = insertelement <8 x i32> %341, i32 %336, i32 4 + %343 = insertelement <8 x i32> %342, i32 %337, i32 5 + %344 = insertelement <8 x i32> %343, i32 undef, i32 6 + %345 = insertelement <8 x i32> %344, i32 undef, i32 7 + %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = fadd float %347, -5.000000e-01 + %351 = fadd float %348, -5.000000e-01 + %352 = fadd float %349, -5.000000e-01 + %353 = fmul float %350, %350 + %354 = fmul float %351, %351 + %355 = fadd float %354, %353 + %356 = fmul float %352, %352 + %357 = fadd float %355, %356 + %358 = call float @llvm.AMDGPU.rsq.f32(float %357) + %359 = fmul float %350, %358 + %360 = fmul float %351, %358 + %361 = fmul float %352, %358 + %362 = bitcast float %136 to i32 + %363 = bitcast float %182 to i32 + %364 = bitcast float %137 to i32 + %365 = bitcast float %183 to i32 + %366 = bitcast float %temp28.1 to i32 + %367 = bitcast float %temp29.1 to i32 + %368 = insertelement <8 x i32> undef, i32 %362, i32 0 + %369 = insertelement <8 x i32> %368, i32 %363, i32 1 + %370 = insertelement <8 x i32> %369, i32 %364, i32 2 + %371 = insertelement <8 x i32> %370, i32 %365, i32 3 + %372 = insertelement <8 x i32> %371, i32 %366, i32 4 + %373 = insertelement <8 x i32> %372, i32 %367, i32 5 + %374 = insertelement <8 x i32> %373, i32 undef, i32 6 + %375 = insertelement <8 x i32> %374, i32 undef, i32 7 + %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) + %377 = extractelement <4 x float> %376, i32 0 + %378 = extractelement <4 x float> %376, i32 1 + %379 = extractelement <4 x float> %376, i32 2 + %380 = extractelement <4 x float> %376, i32 3 + %381 = fsub float -0.000000e+00, %95 + %382 = fsub float -0.000000e+00, %96 + %383 = fsub float -0.000000e+00, %97 + %384 = fmul float %359, %381 + %385 = fmul float %360, %382 + %386 = fadd float %385, %384 + %387 = fmul float %361, %383 + %388 = fadd float %386, %387 + %389 = fmul float %388, %359 + %390 = fmul float %388, %360 + %391 = fmul float %388, %361 + %392 = fmul float 2.000000e+00, %389 + %393 = fmul float 2.000000e+00, %390 + %394 = fmul float 2.000000e+00, %391 + %395 = fsub float -0.000000e+00, %392 + %396 = fadd float %381, %395 + %397 = fsub float -0.000000e+00, %393 + %398 = fadd float %382, %397 + %399 = fsub float -0.000000e+00, %394 + %400 = fadd float %383, %399 + %401 = fmul float %396, %98 + %402 = fmul float %396, %99 + %403 = fmul float %396, %100 + %404 = fmul float %398, %101 + %405 = fadd float %404, %401 + %406 = fmul float %398, %102 + %407 = fadd float %406, %402 + %408 = fmul float %398, %103 + %409 = fadd float %408, %403 + %410 = fmul float %400, %104 + %411 = fadd float %410, %405 + %412 = fmul float %400, %105 + %413 = fadd float %412, %407 + %414 = fmul float %400, %106 + %415 = fadd float %414, %409 + %416 = bitcast float %136 to i32 + %417 = bitcast float %182 to i32 + %418 = bitcast float %137 to i32 + %419 = bitcast float %183 to i32 + %420 = bitcast float %temp28.1 to i32 + %421 = bitcast float 
%temp29.1 to i32 + %422 = insertelement <8 x i32> undef, i32 %416, i32 0 + %423 = insertelement <8 x i32> %422, i32 %417, i32 1 + %424 = insertelement <8 x i32> %423, i32 %418, i32 2 + %425 = insertelement <8 x i32> %424, i32 %419, i32 3 + %426 = insertelement <8 x i32> %425, i32 %420, i32 4 + %427 = insertelement <8 x i32> %426, i32 %421, i32 5 + %428 = insertelement <8 x i32> %427, i32 undef, i32 6 + %429 = insertelement <8 x i32> %428, i32 undef, i32 7 + %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) + %431 = extractelement <4 x float> %430, i32 0 + %432 = extractelement <4 x float> %430, i32 1 + %433 = extractelement <4 x float> %430, i32 2 + %434 = fmul float %48, %411 + %435 = fmul float %49, %411 + %436 = fmul float %50, %411 + %437 = fmul float %51, %413 + %438 = fadd float %437, %434 + %439 = fmul float %52, %413 + %440 = fadd float %439, %435 + %441 = fmul float %53, %413 + %442 = fadd float %441, %436 + %443 = fmul float %54, %415 + %444 = fadd float %443, %438 + %445 = fmul float %55, %415 + %446 = fadd float %445, %440 + %447 = fmul float %56, %415 + %448 = fadd float %447, %442 + %449 = insertelement <4 x float> undef, float %444, i32 0 + %450 = insertelement <4 x float> %449, float %446, i32 1 + %451 = insertelement <4 x float> %450, float %448, i32 2 + %452 = insertelement <4 x float> %451, float %195, i32 3 + %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) + %454 = extractelement <4 x float> %453, i32 0 + %455 = extractelement <4 x float> %453, i32 1 + %456 = extractelement <4 x float> %453, i32 2 + %457 = extractelement <4 x float> %453, i32 3 + %458 = call float @fabs(float %456) + %459 = fdiv float 1.000000e+00, %458 + %460 = fmul float %454, %459 + %461 = fadd float %460, 1.500000e+00 + %462 = fmul float %455, %459 + %463 = fadd float %462, 1.500000e+00 + %464 = bitcast float %463 to i32 + %465 = bitcast float %461 to i32 + %466 = bitcast float %457 to i32 + %467 = insertelement <4 x i32> undef, i32 %464, i32 0 + %468 = insertelement <4 x i32> %467, i32 %465, i32 1 + %469 = insertelement <4 x i32> %468, i32 %466, i32 2 + %470 = insertelement <4 x i32> %469, i32 undef, i32 3 + %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) + %472 = extractelement <4 x float> %471, i32 0 + %473 = extractelement <4 x float> %471, i32 1 + %474 = extractelement <4 x float> %471, i32 2 + %475 = fmul float %431, %472 + %476 = fadd float %475, %329 + %477 = fmul float %432, %473 + %478 = fadd float %477, %330 + %479 = fmul float %433, %474 + %480 = fadd float %479, %331 + %481 = fmul float %107, %107 + %482 = fmul float %108, %108 + %483 = fadd float %482, %481 + %484 = fmul float %109, %109 + %485 = fadd float %483, %484 + %486 = call float @llvm.AMDGPU.rsq.f32(float %485) + %487 = fmul float %107, %486 + %488 = fmul float %108, %486 + %489 = fmul float %109, %486 + %490 = fmul float %377, %40 + %491 = fmul float %378, %41 + %492 = fmul float %379, %42 + %493 = fmul float %359, %487 + %494 = fmul float %360, %488 + %495 = fadd float %494, %493 + %496 = fmul float %361, %489 + %497 = fadd float %495, %496 + %498 = fmul float %497, %359 + %499 = fmul float %497, %360 + %500 = fmul float %497, %361 + %501 = fmul float 2.000000e+00, %498 + %502 = fmul float 2.000000e+00, %499 + %503 = fmul float 2.000000e+00, %500 + %504 = fsub float -0.000000e+00, %501 + %505 = fadd float %487, %504 + %506 = fsub float -0.000000e+00, %502 + %507 = fadd float %488, %506 + %508 = fsub float 
-0.000000e+00, %503 + %509 = fadd float %489, %508 + %510 = fmul float %95, %95 + %511 = fmul float %96, %96 + %512 = fadd float %511, %510 + %513 = fmul float %97, %97 + %514 = fadd float %512, %513 + %515 = call float @llvm.AMDGPU.rsq.f32(float %514) + %516 = fmul float %95, %515 + %517 = fmul float %96, %515 + %518 = fmul float %97, %515 + %519 = fmul float %505, %516 + %520 = fmul float %507, %517 + %521 = fadd float %520, %519 + %522 = fmul float %509, %518 + %523 = fadd float %521, %522 + %524 = fsub float -0.000000e+00, %523 + %525 = fcmp uge float %524, 0.000000e+00 + %526 = select i1 %525, float %524, float 0.000000e+00 + %527 = fmul float %43, %380 + %528 = fadd float %527, 1.000000e+00 + %529 = call float @llvm.pow.f32(float %526, float %528) + %530 = fmul float %476, %37 + %531 = fmul float %478, %38 + %532 = fmul float %480, %39 + %533 = fmul float %359, %487 + %534 = fmul float %360, %488 + %535 = fadd float %534, %533 + %536 = fmul float %361, %489 + %537 = fadd float %535, %536 + %538 = fcmp uge float %537, 0.000000e+00 + %539 = select i1 %538, float %537, float 0.000000e+00 + %540 = fmul float %530, %539 + %541 = fmul float %531, %539 + %542 = fmul float %532, %539 + %543 = fmul float %490, %529 + %544 = fadd float %543, %540 + %545 = fmul float %491, %529 + %546 = fadd float %545, %541 + %547 = fmul float %492, %529 + %548 = fadd float %547, %542 + %549 = fmul float %476, %34 + %550 = fmul float %478, %35 + %551 = fmul float %480, %36 + %552 = fmul float %544, %57 + %553 = fadd float %552, %549 + %554 = fmul float %546, %58 + %555 = fadd float %554, %550 + %556 = fmul float %548, %59 + %557 = fadd float %556, %551 + %558 = bitcast float %136 to i32 + %559 = bitcast float %182 to i32 + %560 = bitcast float %137 to i32 + %561 = bitcast float %183 to i32 + %562 = bitcast float %temp28.1 to i32 + %563 = bitcast float %temp29.1 to i32 + %564 = insertelement <8 x i32> undef, i32 %558, i32 0 + %565 = insertelement <8 x i32> %564, i32 %559, i32 1 + %566 = insertelement <8 x i32> %565, i32 %560, i32 2 + %567 = insertelement <8 x i32> %566, i32 %561, i32 3 + %568 = insertelement <8 x i32> %567, i32 %562, i32 4 + %569 = insertelement <8 x i32> %568, i32 %563, i32 5 + %570 = insertelement <8 x i32> %569, i32 undef, i32 6 + %571 = insertelement <8 x i32> %570, i32 undef, i32 7 + %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) + %573 = extractelement <4 x float> %572, i32 0 + %574 = extractelement <4 x float> %572, i32 1 + %575 = extractelement <4 x float> %572, i32 2 + %576 = fmul float %573, %44 + %577 = fadd float %576, %553 + %578 = fmul float %574, %45 + %579 = fadd float %578, %555 + %580 = fmul float %575, %46 + %581 = fadd float %580, %557 + %582 = call i32 @llvm.SI.packf16(float %577, float %579) + %583 = bitcast i32 %582 to float + %584 = call i32 @llvm.SI.packf16(float %581, float %283) + %585 = bitcast i32 %584 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) + ret void + +ENDIF66: ; preds = %LOOP65 + %586 = bitcast float %temp28.1 to i32 + %587 = bitcast float %temp29.1 to i32 + %588 = insertelement <8 x i32> %237, i32 %586, i32 4 + %589 = insertelement <8 x i32> %588, i32 %587, i32 5 + %590 = insertelement <8 x i32> %589, i32 undef, i32 6 + %591 = insertelement <8 x i32> %590, i32 undef, i32 7 + %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) + %593 = extractelement <4 x float> %592, i32 3 + %594 = 
fcmp oge float %temp30.1, %593 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = and i32 %597, 1065353216 + %599 = bitcast i32 %598 to float + %600 = fmul float 5.000000e-01, %temp32.0 + %601 = fsub float -0.000000e+00, %600 + %602 = fmul float %599, %temp32.0 + %603 = fadd float %602, %601 + %604 = fmul float %214, %603 + %605 = fadd float %604, %temp28.1 + %606 = fmul float %215, %603 + %607 = fadd float %606, %temp29.1 + %608 = fmul float %216, %603 + %609 = fadd float %608, %temp30.1 + %610 = fadd float %temp24.1, 1.000000e+00 + %611 = fmul float %temp32.0, 5.000000e-01 + br label %LOOP65 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: readnone +declare i32 @llvm.SI.tid() #2 + +; Function Attrs: readonly +declare float @ceil(float) #3 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { readonly } +attributes #4 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}main1: +; CHECK: s_endpgm +define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %41 = 
call float @llvm.SI.load.const(<16 x i8> %22, i32 164) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) + %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) + %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) + %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) + %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) + %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) + %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) + %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) + %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) + %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) + %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) + %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) + %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) + %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) + %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) + %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) + %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) + %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) + %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) + %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) + %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) + %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) + %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) + %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) + %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) + %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) + %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) + %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) + %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) + %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) + %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) + %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) + %97 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 608) + %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612) + %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616) + %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624) + %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628) + %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632) + %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636) + %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640) + %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644) + %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648) + %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652) + %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656) + %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660) + %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664) + %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668) + %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672) + %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676) + %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680) + %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684) + %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688) + %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692) + %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696) + %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700) + %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704) + %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708) + %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712) + %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716) + %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864) + %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868) + %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0 + %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0 + %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0 + %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0 + %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0 + %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0 + %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0 + %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0 + %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0 + %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0 + %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0 + %148 = getelementptr [32 x <16 x 
i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 + %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0 + %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0 + %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0 + %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0 + %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0 + %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8 + %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0 + %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8 + %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0 + %162 = fcmp ugt float %17, 0.000000e+00 + %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00 + %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6) + %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6) + %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6) + %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6) + %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6) + %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6) + %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6) + %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6) + %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6) + %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6) + %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6) + %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6) + %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6) + %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6) + %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6) + %196 = 
fmul float %14, %124 + %197 = fadd float %196, %125 + %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %202 = bitcast float %198 to i32 + %203 = icmp ne i32 %202, 0 + %. = select i1 %203, float -1.000000e+00, float 1.000000e+00 + %204 = fsub float -0.000000e+00, %164 + %205 = fadd float %44, %204 + %206 = fsub float -0.000000e+00, %165 + %207 = fadd float %45, %206 + %208 = fsub float -0.000000e+00, %166 + %209 = fadd float %46, %208 + %210 = fmul float %205, %205 + %211 = fmul float %207, %207 + %212 = fadd float %211, %210 + %213 = fmul float %209, %209 + %214 = fadd float %212, %213 + %215 = call float @llvm.AMDGPU.rsq.f32(float %214) + %216 = fmul float %205, %215 + %217 = fmul float %207, %215 + %218 = fmul float %209, %215 + %219 = fmul float %., %54 + %220 = fmul float %13, %47 + %221 = fmul float %197, %48 + %222 = bitcast float %174 to i32 + %223 = bitcast float %175 to i32 + %224 = insertelement <2 x i32> undef, i32 %222, i32 0 + %225 = insertelement <2 x i32> %224, i32 %223, i32 1 + %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2) + %227 = extractelement <4 x float> %226, i32 0 + %228 = extractelement <4 x float> %226, i32 1 + %229 = extractelement <4 x float> %226, i32 2 + %230 = extractelement <4 x float> %226, i32 3 + %231 = fmul float %227, 0x4012611180000000 + %232 = fmul float %228, 0x4012611180000000 + %233 = fmul float %229, 0x4012611180000000 + %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00) + %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00) + %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00) + %237 = fmul float %216, %184 + %238 = fmul float %217, %185 + %239 = fadd float %238, %237 + %240 = fmul float %218, %186 + %241 = fadd float %239, %240 + %242 = fmul float %216, %187 + %243 = fmul float %217, %188 + %244 = fadd float %243, %242 + %245 = fmul float %218, %189 + %246 = fadd float %244, %245 + %247 = fmul float %216, %190 + %248 = fmul float %217, %191 + %249 = fadd float %248, %247 + %250 = fmul float %218, %192 + %251 = fadd float %249, %250 + %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00) + %253 = fmul float %214, 0x3F5A36E2E0000000 + %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00) + %255 = fsub float -0.000000e+00, %254 + %256 = fadd float 1.000000e+00, %255 + %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01) + %258 = fmul float %39, %257 + %259 = fmul float %241, %258 + %260 = fmul float %246, %258 + %261 = fmul float %259, %230 + %262 = fmul float %260, %230 + %263 = fadd float %252, 0x3EE4F8B580000000 + %264 = fsub float -0.000000e+00, %252 + %265 = fadd float 1.000000e+00, %264 + %266 = fmul float 1.200000e+01, %265 + %267 = fadd float %266, 4.000000e+00 + %268 = fsub float -0.000000e+00, %267 + %269 = fmul float %268, %263 + %270 = fsub float -0.000000e+00, %267 + %271 = fmul float %270, %263 + %272 = fsub float -0.000000e+00, %267 + %273 = fmul float %272, %263 + %274 = fdiv float 1.000000e+00, %269 + %275 = fdiv float 1.000000e+00, %271 + %276 = fdiv float 1.000000e+00, %273 + %277 = fmul 
float %261, %274 + %278 = fmul float %262, %275 + %279 = fmul float %263, %276 + br label %LOOP + +LOOP: ; preds = %LOOP, %main_body + %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ] + %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ] + %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ] + %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ] + %280 = bitcast float %temp168.0 to i32 + %281 = bitcast float %temp169.0 to i32 + %282 = insertelement <4 x i32> undef, i32 %280, i32 0 + %283 = insertelement <4 x i32> %282, i32 %281, i32 1 + %284 = insertelement <4 x i32> %283, i32 0, i32 2 + %285 = insertelement <4 x i32> %284, i32 undef, i32 3 + %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2) + %287 = extractelement <4 x float> %286, i32 3 + %288 = fadd float %temp168.0, %277 + %289 = fadd float %temp169.0, %278 + %290 = fadd float %temp170.0, %279 + %291 = fsub float -0.000000e+00, %287 + %292 = fadd float %290, %291 + %293 = fcmp oge float 0.000000e+00, %292 + %294 = sext i1 %293 to i32 + %295 = bitcast i32 %294 to float + %296 = bitcast float %295 to i32 + %297 = icmp ne i32 %296, 0 + br i1 %297, label %IF189, label %LOOP + +IF189: ; preds = %LOOP + %298 = extractelement <4 x float> %286, i32 0 + %299 = extractelement <4 x float> %286, i32 1 + %300 = extractelement <4 x float> %286, i32 2 + %301 = fsub float -0.000000e+00, %292 + %302 = fadd float %temp144.0, %301 + %303 = fdiv float 1.000000e+00, %302 + %304 = fmul float %292, %303 + %305 = fadd float %304, -1.000000e+00 + %306 = fmul float %305, %277 + %307 = fadd float %306, %288 + %308 = fmul float %305, %278 + %309 = fadd float %308, %289 + %310 = fsub float -0.000000e+00, %176 + %311 = fadd float %307, %310 + %312 = fsub float -0.000000e+00, %177 + %313 = fadd float %309, %312 + %314 = fadd float %176, %311 + %315 = fadd float %177, %313 + %316 = fmul float %311, %67 + %317 = fmul float %313, %68 + %318 = fmul float %316, %55 + %319 = fmul float %316, %56 + %320 = fmul float %317, %57 + %321 = fadd float %320, %318 + %322 = fmul float %317, %58 + %323 = fadd float %322, %319 + %324 = fadd float %178, %321 + %325 = fadd float %179, %323 + %326 = fmul float %316, %59 + %327 = fmul float %316, %60 + %328 = fmul float %316, %61 + %329 = fmul float %316, %62 + %330 = fmul float %317, %63 + %331 = fadd float %330, %326 + %332 = fmul float %317, %64 + %333 = fadd float %332, %327 + %334 = fmul float %317, %65 + %335 = fadd float %334, %328 + %336 = fmul float %317, %66 + %337 = fadd float %336, %329 + %338 = fadd float %168, %331 + %339 = fadd float %169, %333 + %340 = fadd float %170, %335 + %341 = fadd float %171, %337 + %342 = bitcast float %338 to i32 + %343 = bitcast float %339 to i32 + %344 = insertelement <2 x i32> undef, i32 %342, i32 0 + %345 = insertelement <2 x i32> %344, i32 %343, i32 1 + %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = extractelement <4 x float> %346, i32 3 + %351 = fmul float %347, %23 + %352 = fmul float %348, %24 + %353 = fmul float %349, %25 + %354 = fmul float %350, %26 + %355 = fmul float %351, %180 + %356 = fmul float %352, %181 + %357 = fmul float %353, %182 + %358 = fmul float %354, %183 + %359 = fsub float -0.000000e+00, %350 + %360 = fadd float 1.000000e+00, %359 + %361 = fmul float %360, %49 + %362 = 
call float @llvm.AMDGPU.lrp(float %361, float %347, float %355) + %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356) + %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357) + %365 = bitcast float %340 to i32 + %366 = bitcast float %341 to i32 + %367 = insertelement <2 x i32> undef, i32 %365, i32 0 + %368 = insertelement <2 x i32> %367, i32 %366, i32 1 + %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2) + %370 = extractelement <4 x float> %369, i32 2 + %371 = fmul float %362, %234 + %372 = fmul float %363, %235 + %373 = fmul float %364, %236 + %374 = fmul float %358, %230 + %375 = bitcast float %314 to i32 + %376 = bitcast float %315 to i32 + %377 = insertelement <2 x i32> undef, i32 %375, i32 0 + %378 = insertelement <2 x i32> %377, i32 %376, i32 1 + %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2) + %380 = extractelement <4 x float> %379, i32 0 + %381 = extractelement <4 x float> %379, i32 1 + %382 = extractelement <4 x float> %379, i32 2 + %383 = extractelement <4 x float> %379, i32 3 + %384 = fcmp olt float 0.000000e+00, %382 + %385 = sext i1 %384 to i32 + %386 = bitcast i32 %385 to float + %387 = bitcast float %386 to i32 + %388 = icmp ne i32 %387, 0 + %.224 = select i1 %388, float %381, float %380 + %.225 = select i1 %388, float %383, float %381 + %389 = bitcast float %324 to i32 + %390 = bitcast float %325 to i32 + %391 = insertelement <2 x i32> undef, i32 %389, i32 0 + %392 = insertelement <2 x i32> %391, i32 %390, i32 1 + %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2) + %394 = extractelement <4 x float> %393, i32 0 + %395 = extractelement <4 x float> %393, i32 1 + %396 = extractelement <4 x float> %393, i32 2 + %397 = extractelement <4 x float> %393, i32 3 + %398 = fcmp olt float 0.000000e+00, %396 + %399 = sext i1 %398 to i32 + %400 = bitcast i32 %399 to float + %401 = bitcast float %400 to i32 + %402 = icmp ne i32 %401, 0 + %temp112.1 = select i1 %402, float %395, float %394 + %temp113.1 = select i1 %402, float %397, float %395 + %403 = fmul float %.224, 2.000000e+00 + %404 = fadd float %403, -1.000000e+00 + %405 = fmul float %.225, 2.000000e+00 + %406 = fadd float %405, -1.000000e+00 + %407 = fmul float %temp112.1, 2.000000e+00 + %408 = fadd float %407, -1.000000e+00 + %409 = fmul float %temp113.1, 2.000000e+00 + %410 = fadd float %409, -1.000000e+00 + %411 = fsub float -0.000000e+00, %404 + %412 = fmul float %411, %35 + %413 = fsub float -0.000000e+00, %406 + %414 = fmul float %413, %35 + %415 = fsub float -0.000000e+00, %408 + %416 = fmul float %415, %36 + %417 = fsub float -0.000000e+00, %410 + %418 = fmul float %417, %36 + %419 = fmul float %416, %370 + %420 = fmul float %418, %370 + %421 = call float @fabs(float %412) + %422 = call float @fabs(float %414) + %423 = fsub float -0.000000e+00, %421 + %424 = fadd float 1.000000e+00, %423 + %425 = fsub float -0.000000e+00, %422 + %426 = fadd float 1.000000e+00, %425 + %427 = fmul float %424, %419 + %428 = fadd float %427, %412 + %429 = fmul float %426, %420 + %430 = fadd float %429, %414 + %431 = fmul float %428, %428 + %432 = fmul float %430, %430 + %433 = fadd float %431, %432 + %434 = fsub float -0.000000e+00, %433 + %435 = fadd float 0x3FF00068E0000000, %434 + %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) + %437 = call float @llvm.AMDGPU.rsq.f32(float %436) + %438 = fmul float %437, %436 + 
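; Note (editorial comment, not in the original patch): %437/%438 compute
; sqrt(%436) as %436 * rsq(%436); the cndlt on -%436 just below appears to
; clamp the result to 0.0 whenever %436 <= 0, so the undefined rsq result
; at zero is never consumed.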
%439 = fsub float -0.000000e+00, %436 + %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) + %441 = fmul float %184, %428 + %442 = fmul float %185, %428 + %443 = fmul float %186, %428 + %444 = fmul float %187, %430 + %445 = fadd float %444, %441 + %446 = fmul float %188, %430 + %447 = fadd float %446, %442 + %448 = fmul float %189, %430 + %449 = fadd float %448, %443 + %450 = fmul float %190, %440 + %451 = fadd float %450, %445 + %452 = fmul float %191, %440 + %453 = fadd float %452, %447 + %454 = fmul float %192, %440 + %455 = fadd float %454, %449 + %456 = fmul float %451, %451 + %457 = fmul float %453, %453 + %458 = fadd float %457, %456 + %459 = fmul float %455, %455 + %460 = fadd float %458, %459 + %461 = call float @llvm.AMDGPU.rsq.f32(float %460) + %462 = fmul float %451, %461 + %463 = fmul float %453, %461 + %464 = fmul float %455, %461 + %465 = fcmp olt float 0.000000e+00, %219 + %466 = sext i1 %465 to i32 + %467 = bitcast i32 %466 to float + %468 = bitcast float %467 to i32 + %469 = icmp ne i32 %468, 0 + br i1 %469, label %IF198, label %ENDIF197 + +IF198: ; preds = %IF189 + %470 = fsub float -0.000000e+00, %462 + %471 = fsub float -0.000000e+00, %463 + %472 = fsub float -0.000000e+00, %464 + br label %ENDIF197 + +ENDIF197: ; preds = %IF189, %IF198 + %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ] + %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ] + %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ] + %473 = bitcast float %220 to i32 + %474 = bitcast float %221 to i32 + %475 = insertelement <2 x i32> undef, i32 %473, i32 0 + %476 = insertelement <2 x i32> %475, i32 %474, i32 1 + %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2) + %478 = extractelement <4 x float> %477, i32 0 + %479 = extractelement <4 x float> %477, i32 1 + %480 = extractelement <4 x float> %477, i32 2 + %481 = extractelement <4 x float> %477, i32 3 + %482 = fmul float %478, %40 + %483 = fadd float %482, %41 + %484 = fmul float %479, %40 + %485 = fadd float %484, %41 + %486 = fmul float %480, %40 + %487 = fadd float %486, %41 + %488 = fmul float %481, %42 + %489 = fadd float %488, %43 + %490 = bitcast float %172 to i32 + %491 = bitcast float %173 to i32 + %492 = insertelement <2 x i32> undef, i32 %490, i32 0 + %493 = insertelement <2 x i32> %492, i32 %491, i32 1 + %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2) + %495 = extractelement <4 x float> %494, i32 0 + %496 = extractelement <4 x float> %494, i32 1 + %497 = extractelement <4 x float> %494, i32 2 + %498 = extractelement <4 x float> %494, i32 3 + %499 = fmul float %498, 3.200000e+01 + %500 = fadd float %499, -1.600000e+01 + %501 = call float @llvm.AMDIL.exp.(float %500) + %502 = fmul float %495, %501 + %503 = fmul float %496, %501 + %504 = fmul float %497, %501 + %505 = fmul float %28, %502 + %506 = fadd float %505, %193 + %507 = fmul float %29, %503 + %508 = fadd float %507, %194 + %509 = fmul float %30, %504 + %510 = fadd float %509, %195 + %511 = fmul float %506, %489 + %512 = fmul float %508, %489 + %513 = fmul float %510, %489 + %514 = fmul float %489, 5.000000e-01 + %515 = fadd float %514, 5.000000e-01 + %516 = fmul float %483, %515 + %517 = fadd float %516, %511 + %518 = fmul float %485, %515 + %519 = fadd float %518, %512 + %520 = fmul float %487, %515 + %521 = fadd float %520, %513 + %522 = fmul float %517, %371 + %523 = fmul float %519, %372 + %524 = fmul float %521, %373 + %525 = fmul float 
%428, 0x3FDB272440000000 + %526 = fmul float %430, 0xBFDB272440000000 + %527 = fadd float %526, %525 + %528 = fmul float %440, 0x3FE99999A0000000 + %529 = fadd float %527, %528 + %530 = fmul float %529, 5.000000e-01 + %531 = fadd float %530, 0x3FE3333340000000 + %532 = fmul float %531, %531 + %533 = fmul float %522, %532 + %534 = fmul float %523, %532 + %535 = fmul float %524, %532 + %536 = fsub float -0.000000e+00, %72 + %537 = fsub float -0.000000e+00, %73 + %538 = fsub float -0.000000e+00, %74 + %539 = fmul float %temp12.0, %536 + %540 = fmul float %temp13.0, %537 + %541 = fadd float %540, %539 + %542 = fmul float %temp14.0, %538 + %543 = fadd float %541, %542 + %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00) + %545 = fmul float %371, %544 + %546 = fmul float %372, %544 + %547 = fmul float %373, %544 + %548 = fmul float %545, %69 + %549 = fmul float %546, %70 + %550 = fmul float %547, %71 + %551 = fsub float -0.000000e+00, %164 + %552 = fadd float %97, %551 + %553 = fsub float -0.000000e+00, %165 + %554 = fadd float %98, %553 + %555 = fsub float -0.000000e+00, %166 + %556 = fadd float %99, %555 + %557 = fmul float %552, %552 + %558 = fmul float %554, %554 + %559 = fadd float %558, %557 + %560 = fmul float %556, %556 + %561 = fadd float %559, %560 + %562 = call float @llvm.AMDGPU.rsq.f32(float %561) + %563 = fmul float %562, %561 + %564 = fsub float -0.000000e+00, %561 + %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) + %566 = fsub float -0.000000e+00, %84 + %567 = fadd float %565, %566 + %568 = fsub float -0.000000e+00, %83 + %569 = fadd float %565, %568 + %570 = fsub float -0.000000e+00, %82 + %571 = fadd float %565, %570 + %572 = fsub float -0.000000e+00, %84 + %573 = fadd float %83, %572 + %574 = fsub float -0.000000e+00, %83 + %575 = fadd float %82, %574 + %576 = fsub float -0.000000e+00, %82 + %577 = fadd float %81, %576 + %578 = fdiv float 1.000000e+00, %573 + %579 = fdiv float 1.000000e+00, %575 + %580 = fdiv float 1.000000e+00, %577 + %581 = fmul float %567, %578 + %582 = fmul float %569, %579 + %583 = fmul float %571, %580 + %584 = fcmp olt float %565, %83 + %585 = sext i1 %584 to i32 + %586 = bitcast i32 %585 to float + %587 = bitcast float %586 to i32 + %588 = icmp ne i32 %587, 0 + br i1 %588, label %ENDIF200, label %ELSE202 + +ELSE202: ; preds = %ENDIF197 + %589 = fcmp olt float %565, %82 + %590 = sext i1 %589 to i32 + %591 = bitcast i32 %590 to float + %592 = bitcast float %591 to i32 + %593 = icmp ne i32 %592, 0 + br i1 %593, label %ENDIF200, label %ELSE205 + +ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197 + %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ] + %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ] + %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ] + %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ] + %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ] + %594 = fcmp olt float %565, %83 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = icmp ne i32 %597, 0 + br i1 %598, label %ENDIF209, label %ELSE211 + +ELSE205: ; preds = %ELSE202 + %599 = fcmp olt float %565, %81 + %600 = sext i1 %599 to i32 + %601 = bitcast i32 %600 to float + %602 = bitcast float %601 to i32 + %603 = icmp ne i32 %602, 0 + %.226 = select i1 %603, float %583, float 1.000000e+00 + %.227 = select i1 %603, 
float %118, float %116 + %.228 = select i1 %603, float %119, float %117 + br label %ENDIF200 + +ELSE211: ; preds = %ENDIF200 + %604 = fcmp olt float %565, %82 + %605 = sext i1 %604 to i32 + %606 = bitcast i32 %605 to float + %607 = bitcast float %606 to i32 + %608 = icmp ne i32 %607, 0 + br i1 %608, label %ENDIF209, label %ELSE214 + +ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200 + %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ] + %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ] + %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ] + %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ] + %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ] + %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ] + %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ] + %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ] + %609 = fmul float %164, %85 + %610 = fmul float %165, %86 + %611 = fadd float %609, %610 + %612 = fmul float %166, %87 + %613 = fadd float %611, %612 + %614 = fmul float %167, %88 + %615 = fadd float %613, %614 + %616 = fmul float %164, %89 + %617 = fmul float %165, %90 + %618 = fadd float %616, %617 + %619 = fmul float %166, %91 + %620 = fadd float %618, %619 + %621 = fmul float %167, %92 + %622 = fadd float %620, %621 + %623 = fmul float %164, %93 + %624 = fmul float %165, %94 + %625 = fadd float %623, %624 + %626 = fmul float %166, %95 + %627 = fadd float %625, %626 + %628 = fmul float %167, %96 + %629 = fadd float %627, %628 + %630 = fsub float -0.000000e+00, %78 + %631 = fadd float 1.000000e+00, %630 + %632 = call float @fabs(float %615) + %633 = call float @fabs(float %622) + %634 = fcmp oge float %631, %632 + %635 = sext i1 %634 to i32 + %636 = bitcast i32 %635 to float + %637 = bitcast float %636 to i32 + %638 = and i32 %637, 1065353216 + %639 = bitcast i32 %638 to float + %640 = fcmp oge float %631, %633 + %641 = sext i1 %640 to i32 + %642 = bitcast i32 %641 to float + %643 = bitcast float %642 to i32 + %644 = and i32 %643, 1065353216 + %645 = bitcast i32 %644 to float + %646 = fmul float %639, %645 + %647 = fmul float %629, %646 + %648 = fmul float %615, %temp68.0 + %649 = fadd float %648, %temp70.0 + %650 = fmul float %622, %temp69.0 + %651 = fadd float %650, %temp71.0 + %652 = fmul float %615, %temp52.0 + %653 = fadd float %652, %temp54.0 + %654 = fmul float %622, %temp53.0 + %655 = fadd float %654, %temp55.0 + %656 = fadd float %temp80.0, -1.000000e+00 + %657 = fmul float %656, %77 + %658 = fadd float %657, 1.000000e+00 + %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00) + %660 = bitcast float %649 to i32 + %661 = bitcast float %651 to i32 + %662 = bitcast float 0.000000e+00 to i32 + %663 = insertelement <4 x i32> undef, i32 %660, i32 0 + %664 = insertelement <4 x i32> %663, i32 %661, i32 1 + %665 = insertelement <4 x i32> %664, i32 %662, i32 2 + %666 = insertelement <4 x i32> %665, i32 undef, i32 3 + %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2) + %668 = extractelement <4 x float> %667, i32 0 + %669 = extractelement <4 x float> %667, i32 1 + %670 = bitcast float %653 to i32 + %671 = bitcast float %655 to i32 + %672 = bitcast float 0.000000e+00 to i32 + %673 = insertelement <4 x i32> undef, i32 %670, i32 0 + %674 = insertelement <4 x i32> 
%673, i32 %671, i32 1 + %675 = insertelement <4 x i32> %674, i32 %672, i32 2 + %676 = insertelement <4 x i32> %675, i32 undef, i32 3 + %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2) + %678 = extractelement <4 x float> %677, i32 0 + %679 = extractelement <4 x float> %677, i32 1 + %680 = fsub float -0.000000e+00, %669 + %681 = fadd float 1.000000e+00, %680 + %682 = fsub float -0.000000e+00, %679 + %683 = fadd float 1.000000e+00, %682 + %684 = fmul float %681, 2.500000e-01 + %685 = fmul float %683, 2.500000e-01 + %686 = fsub float -0.000000e+00, %684 + %687 = fadd float %668, %686 + %688 = fsub float -0.000000e+00, %685 + %689 = fadd float %678, %688 + %690 = fmul float %647, %temp88.0 + %691 = fadd float %690, %temp89.0 + %692 = fmul float %647, %temp90.0 + %693 = fadd float %692, %temp91.0 + %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00) + %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00) + %696 = fsub float -0.000000e+00, %694 + %697 = fadd float %668, %696 + %698 = fsub float -0.000000e+00, %695 + %699 = fadd float %678, %698 + %700 = fmul float %668, %668 + %701 = fmul float %678, %678 + %702 = fsub float -0.000000e+00, %700 + %703 = fadd float %687, %702 + %704 = fsub float -0.000000e+00, %701 + %705 = fadd float %689, %704 + %706 = fcmp uge float %703, %75 + %707 = select i1 %706, float %703, float %75 + %708 = fcmp uge float %705, %75 + %709 = select i1 %708, float %705, float %75 + %710 = fmul float %697, %697 + %711 = fadd float %710, %707 + %712 = fmul float %699, %699 + %713 = fadd float %712, %709 + %714 = fdiv float 1.000000e+00, %711 + %715 = fdiv float 1.000000e+00, %713 + %716 = fmul float %707, %714 + %717 = fmul float %709, %715 + %718 = fcmp oge float %697, 0.000000e+00 + %719 = sext i1 %718 to i32 + %720 = bitcast i32 %719 to float + %721 = bitcast float %720 to i32 + %722 = icmp ne i32 %721, 0 + %.229 = select i1 %722, float 1.000000e+00, float %716 + %723 = fcmp oge float %699, 0.000000e+00 + %724 = sext i1 %723 to i32 + %725 = bitcast i32 %724 to float + %726 = bitcast float %725 to i32 + %727 = icmp ne i32 %726, 0 + %temp28.0 = select i1 %727, float 1.000000e+00, float %717 + %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229) + %729 = call float @llvm.pow.f32(float %728, float %76) + %730 = fmul float %729, %79 + %731 = fadd float %730, %80 + %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00) + %733 = fmul float %732, %732 + %734 = fmul float 2.000000e+00, %732 + %735 = fsub float -0.000000e+00, %734 + %736 = fadd float 3.000000e+00, %735 + %737 = fmul float %733, %736 + %738 = fmul float %548, %737 + %739 = fmul float %549, %737 + %740 = fmul float %550, %737 + %741 = fmul float %738, %515 + %742 = fadd float %741, %533 + %743 = fmul float %739, %515 + %744 = fadd float %743, %534 + %745 = fmul float %740, %515 + %746 = fadd float %745, %535 + %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00) + %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00) + %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00) + %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00) + %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00) + %752 = fmul float %748, %751 + %753 = fmul float %749, %751 + %754 = fmul float %750, %751 + %755 = fmul float %742, %752 + %756 = fmul 
float %744, %753 + %757 = fmul float %746, %754 + %758 = fmul float %temp12.0, %216 + %759 = fmul float %temp13.0, %217 + %760 = fadd float %759, %758 + %761 = fmul float %temp14.0, %218 + %762 = fadd float %760, %761 + %763 = call float @fabs(float %762) + %764 = fmul float %763, %763 + %765 = fmul float %764, %50 + %766 = fadd float %765, %51 + %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00) + %768 = fsub float -0.000000e+00, %767 + %769 = fadd float 1.000000e+00, %768 + %770 = fmul float %33, %769 + %771 = fmul float %33, %769 + %772 = fmul float %33, %769 + %773 = fmul float %34, %769 + %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755) + %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756) + %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757) + %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374) + %778 = fcmp uge float %774, 0x3E6FFFFE60000000 + %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000 + %780 = fcmp uge float %775, 0x3E6FFFFE60000000 + %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000 + %782 = fcmp uge float %776, 0x3E6FFFFE60000000 + %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000 + %784 = fcmp uge float %779, 6.550400e+04 + %785 = select i1 %784, float 6.550400e+04, float %779 + %786 = fcmp uge float %781, 6.550400e+04 + %787 = select i1 %786, float 6.550400e+04, float %781 + %788 = fcmp uge float %783, 6.550400e+04 + %789 = select i1 %788, float 6.550400e+04, float %783 + %790 = fmul float %777, %52 + %791 = fadd float %790, %53 + %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00) + %793 = call i32 @llvm.SI.packf16(float %785, float %787) + %794 = bitcast i32 %793 to float + %795 = call i32 @llvm.SI.packf16(float %789, float %792) + %796 = bitcast i32 %795 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796) + ret void + +ELSE214: ; preds = %ELSE211 + %797 = fcmp olt float %565, %81 + %798 = sext i1 %797 to i32 + %799 = bitcast i32 %798 to float + %800 = bitcast float %799 to i32 + %801 = icmp ne i32 %800, 0 + %.230 = select i1 %801, float %104, float %100 + %.231 = select i1 %801, float %105, float %101 + %.232 = select i1 %801, float %106, float %102 + %.233 = select i1 %801, float %107, float %103 + br label %ENDIF209 +} + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.cndlt(float, float, float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #2 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { nounwind readonly } +attributes #4 = { readonly } diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll new file mode 100644 index 00000000000..4b2d8ec6bf0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -0,0 +1,501 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s 
-verify-machineinstrs | FileCheck -check-prefix=SI %s + +; If this occurs it is likely due to reordering and the restore was +; originally supposed to happen before SI_END_CF. +; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] +; SI-NOT: v_readlane_b32 [[SAVED]] + +define void @main() #0 { +main_body: + %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) + %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) + %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) + %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) + %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) + %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) + %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) + %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) + %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) + %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) + %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) + %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) + %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) + %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) + %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) + %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) + %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) + %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) + %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) + %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) + %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) + %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) + %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) + %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) + %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) + %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) + %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) + %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) + %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) + %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) + %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) + %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) + %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) + %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) + %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) + %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) + %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) + %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) + %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) + %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) + %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) + %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) + %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) + %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) + %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) + %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) + %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) + %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) + 
%50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) + %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) + %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) + %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) + %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) + %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) + %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) + %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) + %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) + %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) + %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) + %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) + %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) + %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) + %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) + %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) + %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + br label %LOOP + +LOOP: ; preds = %ENDIF2795, %main_body + %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] + %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] + %67 = icmp sgt i32 undef, 4 + br i1 %67, label %ENDLOOP, label %ENDIF + +ENDLOOP: ; preds = %ELSE2566, %LOOP + %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00) + ret void + +ENDIF: ; preds = %LOOP + %69 = fsub float %2, undef + %70 = fsub float %3, undef + %71 = fsub float %4, undef + %72 = fmul float %69, 0.000000e+00 + %73 = fmul float %70, undef + %74 = fmul float %71, undef + %75 = fsub float %6, undef + %76 = fsub float %7, undef + %77 = fmul float %75, undef + %78 = fmul float %76, 0.000000e+00 + %79 = call float @llvm.minnum.f32(float %74, float %78) + %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00) + %81 = call float @llvm.maxnum.f32(float %73, float %77) + %82 = call float @llvm.maxnum.f32(float undef, float %79) + %83 = call float @llvm.minnum.f32(float %80, float %81) + %84 = call float @llvm.minnum.f32(float %83, float undef) + %85 = fsub float %14, undef + %86 = fsub float %15, undef + %87 = fsub float %16, undef + %88 = fmul float %85, undef + %89 = fmul float %86, undef + %90 = fmul float %87, undef + %91 = fsub float %17, undef + %92 = fsub float %18, undef + %93 = fsub float %19, undef + %94 = fmul float %91, 0.000000e+00 + %95 = fmul float %92, undef + %96 = fmul float %93, undef + %97 = call float @llvm.minnum.f32(float %89, float %95) + %98 = call float @llvm.maxnum.f32(float %88, float %94) + %99 = call float @llvm.maxnum.f32(float %90, float %96) + %100 = call float @llvm.maxnum.f32(float undef, float %97) + %101 = call float @llvm.maxnum.f32(float %100, float undef) + %102 = call float @llvm.minnum.f32(float %98, float undef) + %103 = call float @llvm.minnum.f32(float %102, float %99) + %104 = fsub float %30, undef + %105 = fsub float %31, undef + %106 = fmul float %104, 0.000000e+00 + %107 = fmul float %105, 0.000000e+00 + %108 = call float @llvm.minnum.f32(float undef, float %106) + %109 = call float @llvm.maxnum.f32(float undef, float %107) + %110 = call float @llvm.maxnum.f32(float undef, float %108) + %111 = call float @llvm.maxnum.f32(float %110, float undef) + %112 = call float @llvm.minnum.f32(float undef, 
float %109) + %113 = fsub float %32, undef + %114 = fsub float %33, undef + %115 = fsub float %34, undef + %116 = fmul float %113, 0.000000e+00 + %117 = fmul float %114, undef + %118 = fmul float %115, undef + %119 = fsub float %35, undef + %120 = fsub float %36, undef + %121 = fsub float %37, undef + %122 = fmul float %119, undef + %123 = fmul float %120, undef + %124 = fmul float %121, undef + %125 = call float @llvm.minnum.f32(float %116, float %122) + %126 = call float @llvm.minnum.f32(float %117, float %123) + %127 = call float @llvm.minnum.f32(float %118, float %124) + %128 = call float @llvm.maxnum.f32(float %125, float %126) + %129 = call float @llvm.maxnum.f32(float %128, float %127) + %130 = fsub float %38, undef + %131 = fsub float %39, undef + %132 = fsub float %40, undef + %133 = fmul float %130, 0.000000e+00 + %134 = fmul float %131, undef + %135 = fmul float %132, undef + %136 = fsub float %41, undef + %137 = fsub float %42, undef + %138 = fsub float %43, undef + %139 = fmul float %136, undef + %140 = fmul float %137, undef + %141 = fmul float %138, undef + %142 = call float @llvm.minnum.f32(float %133, float %139) + %143 = call float @llvm.minnum.f32(float %134, float %140) + %144 = call float @llvm.minnum.f32(float %135, float %141) + %145 = call float @llvm.maxnum.f32(float %142, float %143) + %146 = call float @llvm.maxnum.f32(float %145, float %144) + %147 = fsub float %44, undef + %148 = fsub float %45, undef + %149 = fsub float %46, undef + %150 = fmul float %147, 0.000000e+00 + %151 = fmul float %148, 0.000000e+00 + %152 = fmul float %149, undef + %153 = fsub float %47, undef + %154 = fsub float %48, undef + %155 = fsub float %49, undef + %156 = fmul float %153, undef + %157 = fmul float %154, 0.000000e+00 + %158 = fmul float %155, undef + %159 = call float @llvm.minnum.f32(float %150, float %156) + %160 = call float @llvm.minnum.f32(float %151, float %157) + %161 = call float @llvm.minnum.f32(float %152, float %158) + %162 = call float @llvm.maxnum.f32(float %159, float %160) + %163 = call float @llvm.maxnum.f32(float %162, float %161) + %164 = fsub float %50, undef + %165 = fsub float %51, undef + %166 = fsub float %52, undef + %167 = fmul float %164, undef + %168 = fmul float %165, 0.000000e+00 + %169 = fmul float %166, 0.000000e+00 + %170 = fsub float %53, undef + %171 = fsub float %54, undef + %172 = fsub float %55, undef + %173 = fdiv float 1.000000e+00, %temp18.0 + %174 = fmul float %170, undef + %175 = fmul float %171, undef + %176 = fmul float %172, %173 + %177 = call float @llvm.minnum.f32(float %167, float %174) + %178 = call float @llvm.minnum.f32(float %168, float %175) + %179 = call float @llvm.minnum.f32(float %169, float %176) + %180 = call float @llvm.maxnum.f32(float %177, float %178) + %181 = call float @llvm.maxnum.f32(float %180, float %179) + %182 = fsub float %62, undef + %183 = fsub float %63, undef + %184 = fsub float %64, undef + %185 = fmul float %182, 0.000000e+00 + %186 = fmul float %183, undef + %187 = fmul float %184, undef + %188 = fsub float %65, undef + %189 = fsub float %66, undef + %190 = fmul float %188, undef + %191 = fmul float %189, undef + %192 = call float @llvm.maxnum.f32(float %185, float %190) + %193 = call float @llvm.maxnum.f32(float %186, float %191) + %194 = call float @llvm.maxnum.f32(float %187, float undef) + %195 = call float @llvm.minnum.f32(float %192, float %193) + %196 = call float @llvm.minnum.f32(float %195, float %194) + %.temp292.7 = select i1 undef, float %163, float undef + %temp292.9 = select i1 false, 
float %181, float %.temp292.7 + %.temp292.9 = select i1 undef, float undef, float %temp292.9 + %197 = fcmp ogt float undef, 0.000000e+00 + %198 = fcmp olt float undef, %196 + %199 = and i1 %197, %198 + %200 = fcmp olt float undef, %.temp292.9 + %201 = and i1 %199, %200 + %temp292.11 = select i1 %201, float undef, float %.temp292.9 + br i1 undef, label %IF2565, label %ELSE2566 + +IF2565: ; preds = %ENDIF + br i1 false, label %ENDIF2582, label %ELSE2584 + +ELSE2566: ; preds = %ENDIF + %202 = fcmp oeq float %temp292.11, 1.000000e+04 + br i1 %202, label %ENDLOOP, label %ELSE2593 + +ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 + %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] + %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ] + %203 = fsub float %5, undef + %204 = fmul float %203, undef + %205 = call float @llvm.maxnum.f32(float undef, float %204) + %206 = call float @llvm.minnum.f32(float %205, float undef) + %207 = call float @llvm.minnum.f32(float %206, float undef) + %208 = fcmp ogt float undef, 0.000000e+00 + %209 = fcmp olt float undef, 1.000000e+00 + %210 = and i1 %208, %209 + %211 = fcmp olt float undef, %207 + %212 = and i1 %210, %211 + br i1 %212, label %ENDIF2795, label %ELSE2797 + +ELSE2584: ; preds = %IF2565 + br label %ENDIF2582 + +ENDIF2582: ; preds = %ELSE2584, %IF2565 + %213 = fadd float %1, undef + %214 = fadd float 0.000000e+00, %213 + %215 = call float @llvm.AMDIL.fraction.(float %214) + br i1 undef, label %IF2589, label %ELSE2590 + +IF2589: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ELSE2590: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ENDIF2588: ; preds = %ELSE2590, %IF2589 + %216 = fsub float 1.000000e+00, %215 + %217 = call float @llvm.sqrt.f32(float %216) + %218 = fmul float %217, undef + %219 = fadd float %218, undef + br label %ENDIF2564 + +ELSE2593: ; preds = %ELSE2566 + %220 = fcmp oeq float %temp292.11, %82 + %221 = fcmp olt float %82, %84 + %222 = and i1 %220, %221 + br i1 %222, label %ENDIF2594, label %ELSE2596 + +ELSE2596: ; preds = %ELSE2593 + %223 = fcmp oeq float %temp292.11, %101 + %224 = fcmp olt float %101, %103 + %225 = and i1 %223, %224 + br i1 %225, label %ENDIF2594, label %ELSE2632 + +ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 + %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] + %226 = fmul float %temp894.2, undef + br label %ENDIF2564 + +ELSE2632: ; preds = %ELSE2596 + br i1 undef, label %ENDIF2594, label %ELSE2650 + +ELSE2650: ; preds = %ELSE2632 + %227 = fcmp oeq float %temp292.11, %111 + %228 = fcmp olt float %111, %112 + %229 = and i1 %227, %228 + br i1 %229, label %IF2667, label %ELSE2668 + +IF2667: ; preds = %ELSE2650 + br i1 undef, label %ENDIF2594, label %ELSE2671 + +ELSE2668: ; preds = %ELSE2650 + %230 = fcmp oeq float %temp292.11, %129 + %231 = fcmp olt float %129, undef + %232 = and i1 %230, %231 + br i1 %232, label %ENDIF2594, label %ELSE2686 + +ELSE2671: ; preds = %IF2667 + br label %ENDIF2594 + +ELSE2686: ; preds = %ELSE2668 + %233 = 
fcmp oeq float %temp292.11, %146 + %234 = fcmp olt float %146, undef + %235 = and i1 %233, %234 + br i1 %235, label %ENDIF2594, label %ELSE2704 + +ELSE2704: ; preds = %ELSE2686 + %236 = fcmp oeq float %temp292.11, %181 + %237 = fcmp olt float %181, undef + %238 = and i1 %236, %237 + br i1 %238, label %ENDIF2594, label %ELSE2740 + +ELSE2740: ; preds = %ELSE2704 + br i1 undef, label %IF2757, label %ELSE2758 + +IF2757: ; preds = %ELSE2740 + br i1 undef, label %ENDIF2594, label %ELSE2761 + +ELSE2758: ; preds = %ELSE2740 + br i1 undef, label %IF2775, label %ENDIF2594 + +ELSE2761: ; preds = %IF2757 + br label %ENDIF2594 + +IF2775: ; preds = %ELSE2758 + %239 = fcmp olt float undef, undef + br i1 %239, label %ENDIF2594, label %ELSE2779 + +ELSE2779: ; preds = %IF2775 + br i1 undef, label %ENDIF2594, label %ELSE2782 + +ELSE2782: ; preds = %ELSE2779 + br i1 undef, label %ENDIF2594, label %ELSE2785 + +ELSE2785: ; preds = %ELSE2782 + %240 = fcmp olt float undef, 0.000000e+00 + br i1 %240, label %ENDIF2594, label %ELSE2788 + +ELSE2788: ; preds = %ELSE2785 + %241 = fcmp olt float 0.000000e+00, undef + %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00 + br label %ENDIF2594 + +ELSE2797: ; preds = %ENDIF2564 + %242 = fsub float %8, undef + %243 = fsub float %9, undef + %244 = fsub float %10, undef + %245 = fmul float %242, undef + %246 = fmul float %243, undef + %247 = fmul float %244, undef + %248 = fsub float %11, undef + %249 = fsub float %12, undef + %250 = fsub float %13, undef + %251 = fmul float %248, undef + %252 = fmul float %249, undef + %253 = fmul float %250, undef + %254 = call float @llvm.minnum.f32(float %245, float %251) + %255 = call float @llvm.minnum.f32(float %246, float %252) + %256 = call float @llvm.maxnum.f32(float %247, float %253) + %257 = call float @llvm.maxnum.f32(float %254, float %255) + %258 = call float @llvm.maxnum.f32(float %257, float undef) + %259 = call float @llvm.minnum.f32(float undef, float %256) + %260 = fcmp ogt float %258, 0.000000e+00 + %261 = fcmp olt float %258, 1.000000e+00 + %262 = and i1 %260, %261 + %263 = fcmp olt float %258, %259 + %264 = and i1 %262, %263 + br i1 %264, label %ENDIF2795, label %ELSE2800 + +ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 + br label %LOOP + +ELSE2800: ; preds = %ELSE2797 + br i1 undef, label %ENDIF2795, label %ELSE2803 + +ELSE2803: ; preds = %ELSE2800 + %265 = fsub float %20, undef + %266 = fsub float %21, undef + %267 = fsub float %22, undef + %268 = fmul float %265, undef + %269 = fmul float %266, undef + %270 = fmul float %267, 0.000000e+00 + %271 = fsub float %23, undef + %272 = fsub float %24, undef + %273 = fsub float %25, undef + %274 = fmul float %271, undef + %275 = fmul float %272, undef + %276 = fmul float %273, undef + %277 = call float @llvm.minnum.f32(float %268, float %274) + %278 = call float @llvm.maxnum.f32(float %269, float %275) + %279 = call float @llvm.maxnum.f32(float %270, float %276) + %280 = call float @llvm.maxnum.f32(float %277, float undef) + %281 = call float @llvm.maxnum.f32(float %280, float undef) + %282 = call float @llvm.minnum.f32(float undef, float %278) + %283 = call float @llvm.minnum.f32(float %282, float %279) + %284 = fcmp ogt float %281, 0.000000e+00 + %285 = fcmp olt float %281, 1.000000e+00 + %286 = and i1 %284, %285 + %287 = fcmp olt float %281, %283 + %288 = and i1 %286, %287 + br i1 %288, label %ENDIF2795, label %ELSE2806 + +ELSE2806: ; preds = %ELSE2803 + %289 = fsub 
float %26, undef + %290 = fsub float %27, undef + %291 = fsub float %28, undef + %292 = fmul float %289, undef + %293 = fmul float %290, 0.000000e+00 + %294 = fmul float %291, undef + %295 = fsub float %29, undef + %296 = fmul float %295, undef + %297 = call float @llvm.minnum.f32(float %292, float %296) + %298 = call float @llvm.minnum.f32(float %293, float undef) + %299 = call float @llvm.maxnum.f32(float %294, float undef) + %300 = call float @llvm.maxnum.f32(float %297, float %298) + %301 = call float @llvm.maxnum.f32(float %300, float undef) + %302 = call float @llvm.minnum.f32(float undef, float %299) + %303 = fcmp ogt float %301, 0.000000e+00 + %304 = fcmp olt float %301, 1.000000e+00 + %305 = and i1 %303, %304 + %306 = fcmp olt float %301, %302 + %307 = and i1 %305, %306 + br i1 %307, label %ENDIF2795, label %ELSE2809 + +ELSE2809: ; preds = %ELSE2806 + br i1 undef, label %ENDIF2795, label %ELSE2812 + +ELSE2812: ; preds = %ELSE2809 + br i1 undef, label %ENDIF2795, label %ELSE2815 + +ELSE2815: ; preds = %ELSE2812 + br i1 undef, label %ENDIF2795, label %ELSE2818 + +ELSE2818: ; preds = %ELSE2815 + br i1 undef, label %ENDIF2795, label %ELSE2821 + +ELSE2821: ; preds = %ELSE2818 + %308 = fsub float %56, undef + %309 = fsub float %57, undef + %310 = fsub float %58, undef + %311 = fmul float %308, undef + %312 = fmul float %309, 0.000000e+00 + %313 = fmul float %310, undef + %314 = fsub float %59, undef + %315 = fsub float %60, undef + %316 = fsub float %61, undef + %317 = fmul float %314, undef + %318 = fmul float %315, undef + %319 = fmul float %316, undef + %320 = call float @llvm.maxnum.f32(float %311, float %317) + %321 = call float @llvm.maxnum.f32(float %312, float %318) + %322 = call float @llvm.maxnum.f32(float %313, float %319) + %323 = call float @llvm.minnum.f32(float %320, float %321) + %324 = call float @llvm.minnum.f32(float %323, float %322) + %325 = fcmp ogt float undef, 0.000000e+00 + %326 = fcmp olt float undef, 1.000000e+00 + %327 = and i1 %325, %326 + %328 = fcmp olt float undef, %324 + %329 = and i1 %327, %328 + br i1 %329, label %ENDIF2795, label %ELSE2824 + +ELSE2824: ; preds = %ELSE2821 + %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 + br label %ENDIF2795 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.fraction.(float) #2 + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.minnum.f32(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll new file mode 100644 index 00000000000..5a6129aaa3f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void 
@llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.AMDGPU.barrier.local() #2 + + +@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 +@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 +@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 + +; FUNC-LABEL: @reorder_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: buffer_store_dword +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store volatile i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + call void @llvm.AMDGPU.barrier.local() #2 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; Technically we could reorder these, but just comparing the +; instruction type of the load is insufficient. 
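; To make the point above concrete, here is a minimal editorial sketch (the
; kernel @sketch_disjoint_spaces is hypothetical, not part of this test file).
; The addrspace(2) loads can never alias the addrspace(1) store, so hoisting
; the second load above the store would be legal; but that legality comes from
; the pointers' address spaces, not from the machine opcode, since on CI the
; constant loads and the global store can all lower to buffer instructions.
define void @sketch_disjoint_spaces(i32 addrspace(2)* %cptr, i32 addrspace(1)* %gptr, i32 addrspace(1)* %out) {
  ; (hypothetical kernel for illustration only)
  ; read-only constant memory; no store in this module can modify it
  %x = load i32, i32 addrspace(2)* %cptr, align 4
  ; global store; may lower to buffer_store_dword, just as the constant
  ; loads may lower to buffer_load_dword
  store i32 7, i32 addrspace(1)* %gptr, align 4
  ; safe to move above the store, because addrspace(2) is never written
  %y = load i32, i32 addrspace(2)* %cptr, align 4
  %sum = add i32 %x, %y
  store i32 %sum, i32 addrspace(1)* %out, align 4
  ret void
}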
+ +; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load +; CI: buffer_load_dword +; CI: buffer_store_dword +; CI: buffer_load_dword +; CI: buffer_store_dword +define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_constant_load_local_store_constant_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load +; CI: s_load_dword +; CI: s_load_dword +; CI: s_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_load_local_store_global_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_local_offsets +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 + 
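; Note (editorial comment, not in the original patch): %ptr1 above and
; %ptr2/%ptr3 just below select i32 elements 3, 100 and 101 of %ptr0,
; i.e. byte offsets 12, 400 and 404, which are exactly the offsets the
; ds_write/ds_read CHECK lines above expect.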
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 + + store i32 123, i32 addrspace(3)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4 + store i32 123, i32 addrspace(3)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 789, i32 addrspace(3)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_offsets +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 + + store i32 123, i32 addrspace(1)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4 + store i32 123, i32 addrspace(1)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 789, i32 addrspace(1)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 +; XCI: TBUFFER_STORE_FORMAT +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 +; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { +; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + +; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 +; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + +; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + +; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 +; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, +; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, +; i32 1, i32 0) + +; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + +; %add = add nsw i32 %tmp1, %tmp2 + +; store i32 %add, i32 addrspace(1)* %out, align 4 +; ret void +; } + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #2 = { nounwind noduplicate } diff --git a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll new file 
mode 100644 index 00000000000..bd427dd3ed4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}test_8_min_char: +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; ModuleID = 'radeon' + +define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +entry: + %0 = load i8, i8 addrspace(1)* %in0, align 1 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 + %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 + %3 = insertelement <8 x i8> %1, i8 %2, i32 1 + %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 + %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 + %5 = insertelement <8 x i8> %3, i8 %4, i32 2 + %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 + %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 + %7 = insertelement <8 x i8> %5, i8 %6, i32 3 + %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 + %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 + %9 = insertelement <8 x i8> undef, i8 %8, i32 0 + %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 + %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 + %11 = insertelement <8 x i8> %9, i8 %10, i32 1 + %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 + %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 + %13 = insertelement <8 x i8> %11, i8 %12, i32 2 + %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 + %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 + %15 = insertelement <8 x i8> %13, i8 %14, i32 3 + %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> + %16 = load i8, i8 addrspace(1)* %in1, align 1 + %17 = insertelement <8 x i8> undef, i8 %16, i32 0 + %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 + %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 + %19 = insertelement <8 x i8> %17, i8 %18, i32 1 + %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 + %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 + %21 = insertelement <8 x i8> %19, i8 %20, i32 2 + %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 + %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 + %23 = insertelement <8 x i8> %21, i8 %22, i32 3 + %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 + %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 + %25 = insertelement <8 x i8> undef, i8 %24, i32 0 + %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 + %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 + %27 = insertelement <8 x i8> %25, i8 %26, i32 1 + %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 + %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 + %29 = insertelement <8 x i8> %27, i8 %28, i32 2 + %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 + %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 + %31 = insertelement 
<8 x i8> %29, i8 %30, i32 3 + %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 + %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 + %32 = extractelement <8 x i8> %cond.i, i32 0 + store i8 %32, i8 addrspace(1)* %out, align 1 + %33 = extractelement <8 x i8> %cond.i, i32 1 + %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 + store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 + %34 = extractelement <8 x i8> %cond.i, i32 2 + %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 + store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 + %35 = extractelement <8 x i8> %cond.i, i32 3 + %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 + store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 + %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 + %36 = extractelement <8 x i8> %cond.i, i32 4 + store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 + %37 = extractelement <8 x i8> %cond.i, i32 5 + %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 + store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 + %38 = extractelement <8 x i8> %cond.i, i32 6 + %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 + store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 + %39 = extractelement <8 x i8> %cond.i, i32 7 + %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 + store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} + +!0 = !{null} +!1 = !{null} +!2 = !{null} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} +!4 = !{null} +!5 = !{null} +!6 = !{null} +!7 = !{null} +!8 = !{null} diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll new file mode 100644 index 00000000000..06bee114c23 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_sext_i1_to_i32: +; SI: v_cndmask_b32_e64 +; SI: s_endpgm +define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpgm +define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +entry: + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %sext = sext i32 %add to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i1_to_i64: +; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc +; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} +; SI: s_endpgm +define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i64 + store i64 %sext, i64
addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpgm +define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { + %sext = sext i32 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}v_sext_i32_to_i64: +; SI: v_ashr +; SI: s_endpgm +define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %sext = sext i32 %val to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i16_to_i64: +; SI: s_endpgm +define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { + %sext = sext i16 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll b/llvm/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll new file mode 100644 index 00000000000..dffee70b6b0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll @@ -0,0 +1,39 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s + +; 64-bit select was originally lowered with a build_pair, and this +; could be simplified to 1 cndmask instead of 2, but that broke when +; it started being implemented with a v2i32 build_vector and +; bitcasting. +define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %select to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Fix truncating store for local memory +; SI-LABEL: {{^}}trunc_load_alloca_i64: +; SI: v_movrels_b32 +; SI-NOT: v_movrels_b32 +; SI: s_endpgm +define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { + %idx = add i32 %a, %b + %alloca = alloca i64, i32 4 + %gep0 = getelementptr i64, i64* %alloca, i64 0 + %gep1 = getelementptr i64, i64* %alloca, i64 1 + %gep2 = getelementptr i64, i64* %alloca, i64 2 + %gep3 = getelementptr i64, i64* %alloca, i64 3 + store i64 24, i64* %gep0, align 8 + store i64 9334, i64* %gep1, align 8 + store i64 3935, i64* %gep2, align 8 + store i64 9342, i64* %gep3, align 8 + %gep = getelementptr i64, i64* %alloca, i32 %idx + %load = load i64, i64* %gep, align 8 + %mask = and i64 %load, 4294967296 + %add = add i64 %mask, -1 + store i64 %add, i64 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll new file mode 100644 index 00000000000..da4e91db3a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}sint_to_fp_i32_to_f64: +; SI: v_cvt_f64_i32_e32 +define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}sint_to_fp_i1_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0,
v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = sitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 +; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @s_sint_to_fp_i64_to_f64 +define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %result = sitofp i64 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: @v_sint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = sitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll new file mode 100644 index 00000000000..8506441d136 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} +define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v2i32: +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = sitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v4i32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = sitofp <4 x i32> %value to 
<4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_i1_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll new file mode 100644 index 00000000000..b0c18ca5959 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -0,0 +1,111 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +; SMRD load with an immediate offset. +; GCN-LABEL: {{^}}smrd0: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset. +; GCN-LABEL: {{^}}smrd1: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate. 
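+; A rough note on the offset arithmetic these checks rely on (a summary, not +; part of the original test): the SI SMRD immediate offset is an 8-bit field +; counted in dwords, so the index of 255 above encodes as 0xff and tops out +; at a byte offset of 255 * 4 = 1020 (0x3fc), which is exactly the value the +; VI checks show because VI encodes the same offset in bytes. An index of 256 +; (byte offset 0x400) no longer fits the SI field, so it has to be +; materialized in a register first, as the next test checks.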
+; GCN-LABEL: {{^}}smrd2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; GCN: s_endpgm +define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with a 64-bit offset +; GCN-LABEL: {{^}}smrd3: +; FIXME: There are too many copies here because we don't fold immediates +; through REG_SEQUENCE +; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; +; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 +; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; FIXME: We should be able to use s_load_dword here +; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 +; TODO: Add VI checks +; GCN: s_endpgm +define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load using the load.const intrinsic with an immediate offset +; GCN-LABEL: {{^}}smrd_load_const0: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 +define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; SMRD load using the load.const intrinsic with the largest possible immediate +; offset. +; GCN-LABEL: {{^}}smrd_load_const1: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} +; SMRD load using the load.const intrinsic with an offset greater than the +; largest possible immediate offset.
+; GCN-LABEL: {{^}}smrd_load_const2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll new file mode 100644 index 00000000000..46409cdfae1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +; This is broken because the low half of the 64-bit add remains on the +; SALU, but the upper half does not. The addc expects the carry bit +; set in vcc, which is undefined since the low scalar half add sets +; scc instead. + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { + %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, 399 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; Doesn't use constants +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll new file mode 100644 index 00000000000..bcbc32f4c05 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -0,0 +1,213 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs |
FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG-LABEL: {{^}}ashr_v2i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v2i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = ashr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v4i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = ashr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64: +;EG: ASHR + +;SI-LABEL: {{^}}ashr_i64: +;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +;VI-LABEL: {{^}}ashr_i64: +;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = ashr i64 %0, 8 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64_2: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_i64_2: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_i64_2: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = ashr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v2i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = ashr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: ASHR {{.*}}, [[SHC]] +;EG-DAG: ASHR {{.*}}, [[SHD]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHC]] +;EG-DAG: LSHR {{.*}}, [[SHD]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v4i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = ashr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll new file mode 100644 index 00000000000..c78fd549b31 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s + +define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in + %den = load i32, i32 addrspace(1) * %den_ptr + %result = srem i32 %num, %den + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 4 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 +; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: s_endpgm +define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr + %result = srem <2 x i32> %num, %den + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %result = srem <2 x i32> %num, <i32 4, i32 4> + store <2 x i32> %result, 
<2 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr + %result = srem <4 x i32> %num, %den + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %num = load i64, i64 addrspace(1) * %in + %den = load i64, i64 addrspace(1) * %den_ptr + %result = srem i64 %num, %den + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %num = load i64, i64 addrspace(1) * %in + %result = srem i64 %num, 4 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr + %result = srem <2 x i64> %num, %den + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %result = srem <2 x i64> %num, <i64 4, i64 4> + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr + %result = srem <4 x i64> %num, %den + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4> + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll new file mode 100644 index 00000000000..4904d7fa1bd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lshr_i32: +; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = lshr i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = lshr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = lshr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 +define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = lshr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = lshr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = lshr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll new file mode 100644 index 00000000000..26884a1b776 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}ssubo_i64_zext: +define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i32: +define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i32: +define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 +define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } 
%ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll new file mode 100644 index 00000000000..4a72b4d090a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s + +; This test is for a bug in the machine scheduler where stores without +; an underlying object would be moved across the barrier. In this +; test, the <2 x i8> store will be split into two i8 stores, so they +; won't have an underlying object. + +; CHECK-LABEL: {{^}}test: +; CHECK: ds_write_b8 +; CHECK: ds_write_b8 +; CHECK: s_barrier +; CHECK: s_endpgm +; Function Attrs: nounwind +define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { +bb: + %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 + %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 + %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 + %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 + %tmp16 = add i32 %tmp13, 1 + %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 + store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 + tail call void @llvm.AMDGPU.barrier.local() #2 + %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 + %tmp26 = sext i32 %tmp25 to i64 + %tmp27 = sext i32 %arg4 to i64 + %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 + %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 + %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 + store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 + %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 + %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 + %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 + store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i32.ll b/llvm/test/CodeGen/AMDGPU/store-v3i32.ll new file mode 100644 index 00000000000..33617b55ed6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-v3i32.ll @@ -0,0 +1,13 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs 
-march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; 3 vectors have the same size and alignment as 4 vectors, so this +; should be done in a single store. + +; SI-LABEL: {{^}}store_v3i32: +; SI: buffer_store_dwordx4 +define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll new file mode 100644 index 00000000000..e0c554ad2c1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll @@ -0,0 +1,29 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}global_store_v3i64: +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}global_store_v3i64_unaligned: +define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64: +define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64_unaligned: +define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll new file mode 100644 index 00000000000..d5af3b29118 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s + +; This tests for a bug that caused a crash in +; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting +; scratch loads and stores. 
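+; As a rough sketch of what the vector GEP below computes (assuming the splat +; <i16 16, ...> second index): each lane advances its [1024 x i32] pointer to +; element 16 of the array, i.e. by 16 * 4 = 64 bytes, and the resulting +; <4 x i32*> value is then stored through a plain (private) pointer, which is +; what exercises the scratch store selection path.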
+; CHECK-LABEL: {{^}}store_vector_ptrs: +define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { + %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16> + store <4 x i32*> %p, <4 x i32*>* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/store.ll b/llvm/test/CodeGen/AMDGPU/store.ll new file mode 100644 index 00000000000..0f89405e073 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store.ll @@ -0,0 +1,369 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s + +;===------------------------------------------------------------------------===; +; Global Address Space +;===------------------------------------------------------------------------===; +; FUNC-LABEL: {{^}}store_i1: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i1(i1 addrspace(1)* %out) { +entry: + store i1 true, i1 addrspace(1)* %out + ret void +} + +; i8 store +; FUNC-LABEL: {{^}}store_i8: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y +; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) + + +; IG 1: Truncate the calculated shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 255 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the previous IGs. +; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_byte + +define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(1)* %out + ret void +} + +; i16 store +; FUNC-LABEL: {{^}}store_i16: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value + + +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 3(4.203895e-45), + +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y + +; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) +; IG 1: Truncate the calculated shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 65535 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the previous IGs. 
+; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_short +define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8: +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}store_v2i16: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_short +; SI: buffer_store_short +define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(1)* %out + ret void +} + +; floating-point store +; FUNC-LABEL: {{^}}store_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 + +; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_store_dword + +define void @store_f32(float addrspace(1)* %out, float %in) { + store float %in, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i16: +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI-NOT: buffer_store_byte +define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(1)* %out + ret void +} + +; vec2 floating-point stores +; FUNC-LABEL: {{^}}store_v2f32: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 + +define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +entry: + %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0 + %1 = insertelement <2 x float> %0, float %b, i32 1 + store <2 x float> %1, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG-NOT: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD +; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx4 +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i8: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i16: +; EG: MEM_RAT MSKOR +; SI: buffer_store_short +define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; Local Address Space +;===------------------------------------------------------------------------===; + +; FUNC-LABEL: 
{{^}}store_local_i1: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i1(i1 addrspace(3)* %out) { +entry: + store i1 true, i1 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i8: +; EG: LDS_BYTE_WRITE + +; SI: ds_write_b8 +define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { + store i8 %in, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i16: +; EG: LDS_SHORT_WRITE + +; SI: ds_write_b16 +define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { + store i16 %in, i16 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i16: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b16 +; SI: ds_write_b16 +define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i8: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i32: +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b64 +define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +entry: + store <2 x i32> %in, <2 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i32: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i8: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i16: +; EG: LDS_SHORT_WRITE +; SI: ds_write_b16 +define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(3)* %out + ret void +} + +; The stores in this function are combined by the optimizer to create a +; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer +; should not try to split the 64-bit store back into 2 32-bit stores. +; +; Evergreen / Northern Islands don't support 64-bit stores yet, so there should +; be two 32-bit stores. 
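+; As an illustrative sketch (not taken from the test input), the combined +; form is roughly: +;   %p = bitcast i32 addrspace(1)* %out to i64 addrspace(1)* +;   store i64 %merged, i64 addrspace(1)* %p, align 4 +; where %merged is a hypothetical value holding both words: a single 64-bit +; store with only 4-byte alignment, matching the one buffer_store_dwordx2 +; expected on SI below.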
+ +; FUNC-LABEL: {{^}}vecload2: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 +define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +entry: + %0 = load i32, i32 addrspace(2)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 + %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + store i32 %0, i32 addrspace(1)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; When i128 was a legal type this program generated "cannot select" errors: + +; FUNC-LABEL: {{^}}"i128-const-store": +; FIXME: We should be able to do this with one store instruction +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; SI: buffer_store_dwordx4 +define void @i128-const-store(i32 addrspace(1)* %out) { +entry: + store i32 1, i32 addrspace(1)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 1, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 2, i32 addrspace(1)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 2, i32 addrspace(1)* %arrayidx6, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/store.r600.ll b/llvm/test/CodeGen/AMDGPU/store.r600.ll new file mode 100644 index 00000000000..696fb033b5e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store.r600.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +; XXX: Merge this test into store.ll once it is supported on SI + +; v4i32 store +; EG: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 + +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; v4f32 store +; EG: {{^}}store_v4f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 +define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(1) * %in + store <4 x float> %1, <4 x float> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/structurize.ll b/llvm/test/CodeGen/AMDGPU/structurize.ll new file mode 100644 index 00000000000..02e592e9a55 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/structurize.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s +; Test case for a crash in the AMDILCFGStructurizer from a CFG like this: +; +; entry +; / \ +; diamond_head branch_from +; / \ | +; diamond_false diamond_true +; \ / +; done +; +; When the diamond_true branch had more than 100 instructions. 
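Reduced to its essentials, the crash shape is a side entry into one arm of an if/else diamond. A minimal sketch with the same block names as the test that follows (illustrative only; @sketch, %c0 and %c1 are hypothetical):
  define void @sketch(i1 %c0, i1 %c1) {
  entry:
    br i1 %c0, label %diamond_head, label %branch_from
  diamond_head:
    br i1 %c1, label %diamond_true, label %diamond_false
  branch_from:        ; the extra edge into diamond_true
    br label %diamond_true
  diamond_true:       ; must be large (> 100 ISA instructions) to trigger the crash
    br label %done
  diamond_false:
    br label %done
  done:
    ret void
  }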
+; +; + +; CHECK-LABEL: {{^}}branch_into_diamond: +; === entry block: +; CHECK: ALU_PUSH_BEFORE +; === Branch instruction (IF): +; CHECK: JUMP + ; === branch_from block + ; CHECK: ALU + ; === Duplicated diamond_true block (There can be more than one ALU clause): + ; === XXX: We should be able to optimize this so the basic block is not + ; === duplicated. See comments in + ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf() + ; CHECK: ALU +; === Branch instruction (ELSE): +; CHECK: ELSE + ; === diamond_head block: + ; CHECK: ALU_PUSH_BEFORE + ; === Branch instruction (IF): + ; CHECK: JUMP + ; === diamond_true block (There can be more than one ALU clause): + ; ALU + ; === Branch instruction (ELSE): + ; CHECK: ELSE + ; === diamond_false block plus implicit ENDIF + ; CHECK: ALU_POP_AFTER +; === Branch instruction (ENDIF): +; CHECK: POP +; === done block: +; CHECK: ALU +; CHECK: MEM_RAT_CACHELESS +; CHECK: CF_END + + +define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: +%0 = icmp ne i32 %a, 0 + br i1 %0, label %diamond_head, label %branch_from + +diamond_head: + %1 = icmp ne i32 %a, 1 + br i1 %1, label %diamond_true, label %diamond_false + +branch_from: + %2 = add i32 %a, 1 + br label %diamond_true + +diamond_false: + %3 = add i32 %a, 2 + br label %done + +diamond_true: + %4 = phi i32 [%2, %branch_from], [%a, %diamond_head] + ; This block needs to be > 100 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %a, %b + %div1 = udiv i32 %div0, %4 + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + br label %done + +done: + %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/structurize1.ll b/llvm/test/CodeGen/AMDGPU/structurize1.ll new file mode 100644 index 00000000000..77432c1f9d2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/structurize1.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s + +; This tests for a bug where the AMDILCFGStructurizer was crashing on loops +; like this: +; +; for (i = 0; i < x; i++) { +; if (cond0) { +; if (cond1) { +; +; } else { +; +; } +; if (cond2) { +; +; } +; } +; } + +; CHECK-LABEL: {{^}}if_inside_loop: +; CHECK: LOOP_START_DX10 +; CHECK: END_LOOP +define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%inc, %for.inc] + %val = phi i32 [0, %entry], [%val.for.inc, %for.inc] + %inc = add i32 %0, 1 + %1 = icmp ult i32 10, %a + br i1 %1, label %for.inc, label %if.then + +if.then: + %2 = icmp ne i32 0, %b + br i1 %2, label %if.then.true, label %if.then.false + +if.then.true: + %3 = add i32 %a, %val + br label %if + +if.then.false: + %4 = mul i32 %a, %val + br label %if + +if: + %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false] + %5 = icmp ne i32 0, %c + br i1 %5, label %if.true, label %for.inc + +if.true: + %6 = add i32 %a, %val.if + br label %for.inc + +for.inc: + %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true] + %7 = icmp ne i32 0, %d + br i1 %7, label %for.body, label %exit + +exit: + store i32 %val.for.inc, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll new file mode 100644 index 00000000000..b7fba0efa5b --- /dev/null 
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -0,0 +1,130 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; FUNC-LABEL: {{^}}test_sub_i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = sub i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_sub_v2i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = sub <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_sub_v4i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = sub <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_sub_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sub_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b 
= load i64, i64 addrspace(1)* %b_ptr + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v2i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = sub <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v4i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid + %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = sub <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll new file mode 100644 index 00000000000..c4dae4736cf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -0,0 +1,109 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s + +; SI-LABEL:{{^}}row_filter_C1_D0: +; SI: s_endpgm +; Function Attrs: nounwind +define void @row_filter_C1_D0() { +entry: + br i1 undef, label %for.inc.1, label %do.body.preheader + +do.body.preheader: ; preds = %entry + %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 + br i1 undef, label %do.body56.1, label %do.body90 + +do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader + %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] + %2 = insertelement <4 x i32> %1, i32 undef, i32 2 + %3 = insertelement <4 x i32> %2, i32 undef, i32 3 + br i1 undef, label %do.body124.1, label %do.body.1562.preheader + +do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 + %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] + %4 = insertelement <4 x i32> undef, i32 undef, i32 1 + br label %for.inc.1 + +do.body56.1: ; preds = %do.body.preheader + %5 = insertelement <4 x i32> %0, i32 undef, i32 1 + %or.cond472.1 = or i1 undef, undef + br i1 %or.cond472.1, label %do.body56.2, label %do.body90 + +do.body56.2: ; preds = %do.body56.1 + %6 = insertelement <4 x i32> %5, i32 undef, i32 1 + br label %do.body90 + +do.body124.1: ; preds = %do.body90 + %7 = insertelement <4 x i32> %3, i32 undef, i32 3 + br label %do.body.1562.preheader + +for.inc.1: ; preds = %do.body.1562.preheader, %entry + %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] + %add.i495 = add <4 x i32> %storemerge591, undef + unreachable +} + +; SI-LABEL: {{^}}foo: +; SI: s_endpgm 
+define void @foo() #0 { +bb: + br i1 undef, label %bb2, label %bb1 + +bb1: ; preds = %bb + br i1 undef, label %bb4, label %bb6 + +bb2: ; preds = %bb4, %bb + %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] + br i1 undef, label %bb9, label %bb13 + +bb4: ; preds = %bb7, %bb6, %bb1 + %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] + br label %bb2 + +bb6: ; preds = %bb1 + br i1 undef, label %bb7, label %bb4 + +bb7: ; preds = %bb6 + %tmp8 = fmul float undef, undef + br label %bb4 + +bb9: ; preds = %bb2 + %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) + %tmp11 = extractelement <4 x float> %tmp10, i32 1 + %tmp12 = extractelement <4 x float> %tmp10, i32 3 + br label %bb14 + +bb13: ; preds = %bb2 + br i1 undef, label %bb23, label %bb24 + +bb14: ; preds = %bb27, %bb24, %bb9 + %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] + %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] + %tmp17 = fmul float 10.5, %tmp16 + %tmp18 = fmul float 11.5, %tmp15 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) + ret void + +bb23: ; preds = %bb13 + br i1 undef, label %bb24, label %bb26 + +bb24: ; preds = %bb26, %bb23, %bb13 + %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] + br i1 undef, label %bb27, label %bb14 + +bb26: ; preds = %bb23 + br label %bb24 + +bb27: ; preds = %bb24 + br label %bb14 +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll new file mode 100644 index 00000000000..8bd995a8ecb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s +; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges +; properly. + +; Just make sure this test doesn't crash. 
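Loosely, the pattern under test is a wide vector value with only one live lane, which leaves dead defs on the remaining sub-ranges after coalescing. A stripped-down sketch of that shape (illustrative only; @sketch and %x are hypothetical):
  define void @sketch(i32 %x) {
    %v = insertelement <4 x i32> undef, i32 %x, i32 1   ; lanes 0, 2 and 3 are never defined
    %lane = extractelement <4 x i32> %v, i32 1          ; only the lane-1 sub-range is read
    store i32 %lane, i32 addrspace(1)* undef, align 4
    ret void
  }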
+; CHECK-LABEL: foobar: +; CHECK: s_endpgm +define void @foobar() { + %v0 = icmp eq <4 x i32> undef, zeroinitializer + %v3 = sext <4 x i1> %v0 to <4 x i32> + %v4 = extractelement <4 x i32> %v3, i32 1 + %v5 = icmp ne i32 %v4, 0 + %v6 = select i1 %v5, i32 undef, i32 0 + %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 + store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 + ret void +} + +declare double @llvm.fma.f64(double, double, double) diff --git a/llvm/test/CodeGen/AMDGPU/swizzle-export.ll b/llvm/test/CodeGen/AMDGPU/swizzle-export.ll new file mode 100644 index 00000000000..000ee2faa47 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swizzle-export.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +;EG: {{^}}main: +;EG: EXPORT T{{[0-9]+}}.XYXX +;EG: EXPORT T{{[0-9]+}}.ZXXX +;EG: EXPORT T{{[0-9]+}}.XXWX +;EG: EXPORT T{{[0-9]+}}.XXXW + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* null + %5 = extractelement <4 x float> %4, i32 1 + %6 = load <4 x float>, <4 x float> addrspace(8)* null + %7 = extractelement <4 x float> %6, i32 2 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 0 + %10 = fmul float 0.000000e+00, %9 + %11 = load <4 x float>, <4 x float> addrspace(8)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %5, %12 + %14 = load <4 x float>, <4 x float> addrspace(8)* null + %15 = extractelement <4 x float> %14, i32 0 + %16 = fmul float 0.000000e+00, %15 + %17 = load <4 x float>, <4 x float> addrspace(8)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float 0.000000e+00, %18 + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fmul float %7, %21 + %23 = load <4 x float>, <4 x float> addrspace(8)* null + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float 0.000000e+00, %24 + %26 = load <4 x float>, <4 x float> addrspace(8)* null + %27 = extractelement <4 x float> %26, i32 0 + %28 = fmul float 0.000000e+00, %27 + %29 = load <4 x float>, <4 x float> addrspace(8)* null + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float 0.000000e+00, %30 + %32 = load <4 x float>, <4 x float> addrspace(8)* null + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float 0.000000e+00, %33 + %35 = load <4 x float>, <4 x float> addrspace(8)* null + %36 = extractelement <4 x float> %35, i32 0 + %37 = fmul float 0.000000e+00, %36 + %38 = load <4 x float>, <4 x float> addrspace(8)* null + %39 = extractelement <4 x float> %38, i32 0 + %40 = fmul float 1.000000e+00, %39 + %41 = load <4 x float>, <4 x float> addrspace(8)* null + %42 = extractelement <4 x float> %41, i32 0 + %43 = fmul float 0.000000e+00, %42 + %44 = load <4 x float>, <4 x float> addrspace(8)* null + %45 = extractelement <4 x float> %44, i32 0 + %46 = fmul float 0.000000e+00, %45 + %47 = load <4 x float>, <4 x float> addrspace(8)* null + %48 = extractelement <4 x float> %47, i32 0 + %49 = fmul float 0.000000e+00, %48 + %50 = load <4 x float>, <4 x float> addrspace(8)* null + %51 = extractelement <4 x float> %50, i32 0 + %52 = fmul float 0.000000e+00, %51 + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = 
fmul float 1.000000e+00, %54 + %56 = insertelement <4 x float> undef, float %0, i32 0 + %57 = insertelement <4 x float> %56, float %1, i32 1 + %58 = insertelement <4 x float> %57, float %2, i32 2 + %59 = insertelement <4 x float> %58, float %3, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) + %60 = insertelement <4 x float> undef, float %10, i32 0 + %61 = insertelement <4 x float> %60, float %13, i32 1 + %62 = insertelement <4 x float> %61, float %16, i32 2 + %63 = insertelement <4 x float> %62, float %19, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) + %64 = insertelement <4 x float> undef, float %22, i32 0 + %65 = insertelement <4 x float> %64, float %25, i32 1 + %66 = insertelement <4 x float> %65, float %28, i32 2 + %67 = insertelement <4 x float> %66, float %31, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) + %68 = insertelement <4 x float> undef, float %34, i32 0 + %69 = insertelement <4 x float> %68, float %37, i32 1 + %70 = insertelement <4 x float> %69, float %40, i32 2 + %71 = insertelement <4 x float> %70, float %43, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) + %72 = insertelement <4 x float> undef, float %46, i32 0 + %73 = insertelement <4 x float> %72, float %49, i32 1 + %74 = insertelement <4 x float> %73, float %52, i32 2 + %75 = insertelement <4 x float> %74, float %55, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) + ret void +} + +; EG: {{^}}main2: +; EG: T{{[0-9]+}}.XY__ +; EG: T{{[0-9]+}}.ZXY0 + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = fadd float %0, 2.5 + %3 = fmul float %1, 3.5 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 0 + %6 = call float @llvm.cos.f32(float %5) + %7 = load <4 x float>, <4 x float> addrspace(8)* null + %8 = extractelement <4 x float> %7, i32 0 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = extractelement <4 x float> %9, i32 1 + %11 = insertelement <4 x float> undef, float %2, i32 0 + %12 = insertelement <4 x float> %11, float %3, i32 1 + call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) + %13 = insertelement <4 x float> undef, float %6, i32 0 + %14 = insertelement <4 x float> %13, float %8, i32 1 + %15 = insertelement <4 x float> %14, float %10, i32 2 + %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.cos.f32(float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/llvm/test/CodeGen/AMDGPU/tex-clause-antidep.ll new file mode 100644 index 00000000000..cbb9c50974a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tex-clause-antidep.ll @@ -0,0 +1,25 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX +;CHECK-NEXT: ALU + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = 
insertelement <4 x float> undef, float %1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float %3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %11 = fadd <4 x float> %9, %10 + call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/texture-input-merge.ll b/llvm/test/CodeGen/AMDGPU/texture-input-merge.ll new file mode 100644 index 00000000000..789538af582 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/texture-input-merge.ll @@ -0,0 +1,31 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: MOV + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = fmul float %1, 3.0 + %6 = fmul float %2, 3.0 + %7 = fmul float %3, 3.0 + %8 = fmul float %4, 3.0 + %9 = insertelement <4 x float> undef, float %5, i32 0 + %10 = insertelement <4 x float> %9, float %6, i32 1 + %11 = insertelement <4 x float> undef, float %7, i32 0 + %12 = insertelement <4 x float> %11, float %5, i32 1 + %13 = insertelement <4 x float> undef, float %8, i32 0 + %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %17 = fadd <4 x float> %14, %15 + %18 = fadd <4 x float> %17, %16 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll new file mode 100644 index 00000000000..dac74728b3c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}} +; SI: v_cndmask_b32_e64 +; SI: buffer_store_byte +define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: The negate should be 
inverting the compare. +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp 
= icmp ne i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: This should be one compare. +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: +; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} +; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] +; XSI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} +; SI-NEXT: v_cndmask_b32_e64 +; SI-NEXT: buffer_store_byte +define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %masked = and i8 %load, 255 + %ext = sext i8 %masked to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll new file mode 100644 index 00000000000..c29872beef8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -0,0 +1,56 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI < %s + +; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: +; GCN: s_endpgm +define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { + %val = load double, double addrspace(1)* %in + %cvt = fptrunc double %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; GCN: s_endpgm +define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { + %val = load <2 x double>, <2 x double> addrspace(1)* %in + %cvt = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; GCN: s_endpgm +define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { + %val = load <3 x double>, <3 x double> addrspace(1)* %in + %cvt = fptrunc <3 x double> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* 
%out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: +; GCN: s_endpgm +define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { + %val = load <4 x double>, <4 x double> addrspace(1)* %in + %cvt = fptrunc <4 x double> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; GCN: s_endpgm +define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { + %val = load <8 x double>, <8 x double> addrspace(1)* %in + %cvt = fptrunc <8 x double> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; GCN: s_endpgm +define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { + %val = load <16 x double>, <16 x double> addrspace(1)* %in + %cvt = fptrunc <16 x double> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll new file mode 100644 index 00000000000..b71a838b62c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + + +; SI-LABEL: {{^}}global_truncstore_i32_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { + %trunc = trunc i32 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i64_to_i1: +; SI: buffer_store_byte +define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { + %trunc = trunc i64 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { + %trunc = trunc i16 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll new file mode 100644 index 00000000000..878ea3f4899 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests for a bug in the SelectionDAG where custom lowered truncated +; vector stores at the end of a basic block were not being added to the +; LegalizedNodes list, which triggered an assertion failure. 
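In reduced form, the trigger is a narrow vector store (legalized as a truncating store) sitting at the end of its basic block. A minimal sketch mirroring the test that follows (illustrative only; @sketch is hypothetical):
  define void @sketch(<4 x i8> %in, <4 x i8> addrspace(1)* %out, i1 %c) {
  entry:
    br i1 %c, label %if, label %done
  if:
    store <4 x i8> %in, <4 x i8> addrspace(1)* %out   ; custom-lowered truncating vector store
    br label %done                                    ; the store is the last operation in its block
  done:
    ret void
  }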
+ +; CHECK-LABEL: {{^}}test: +; CHECK: MEM_RAT_CACHELESS STORE_RAW +define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %done + +if: + store <4 x i8> %in, <4 x i8> addrspace(1)* %out + br label %done + +done: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll new file mode 100644 index 00000000000..bf690ca4cb2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +; SI-LABEL: {{^}}trunc_i64_to_i32_store: +; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb +; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] +; SI: buffer_store_dword [[VLOAD]] + +; EG-LABEL: {{^}}trunc_i64_to_i32_store: +; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG: LSHR +; EG-NEXT: 2( + + %result = trunc i64 %in to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}trunc_load_shl_i64: +; SI-DAG: s_load_dwordx2 +; SI-DAG: s_load_dword [[SREG:s[0-9]+]], +; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2 +; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]] +; SI: buffer_store_dword [[VSHL]], +define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { + %b = shl i64 %a, 2 + %result = trunc i64 %b to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}trunc_shl_i64: +; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 +; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], +; SI: s_addc_u32 +; SI: v_mov_b32_e32 +; SI: v_mov_b32_e32 +; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] +; SI: buffer_store_dword v[[LO_VREG]], +define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { + %aa = add i64 %a, 234 ; Prevent shrinking store. 
+ %b = shl i64 %aa, 2 + %result = trunc i64 %b to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits + ret void +} + +; SI-LABEL: {{^}}trunc_i32_to_i1: +; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { + %a = load i32, i32 addrspace(1)* %ptr, align 4 + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: +; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}s_trunc_i64_to_i1: +; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}v_trunc_i64_to_i1: +; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} +; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %x = load i64, i64 addrspace(1)* %gep + + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out.gep + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll new file mode 100644 index 00000000000..76c32afc1f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -0,0 +1,58 @@ +; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s + +; This IR comes from this OpenCL C code: +; +; if (b + 4 > a) { +; for (int i = 0; i < 4; i++, b++) { +; if (b + 1 <= a) +; *(dst + c + b) = 0; +; else +; break; +; } +; } +; +; This test is meant to check that this loop isn't unrolled into more than +; four iterations. The loop unrolling preferences we currently use cause this +; loop to not be unrolled at all, but that may change in the future. 
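For reference, the loop above can execute at most four times (i runs from 0 to 3 with an early exit), so a fully unrolled body would contain four copies of the store and trip the CHECK-NOT below. One extra unrolled step would look roughly like this (illustrative only; %p.0 and %p.1 are hypothetical pointers for consecutive iterations):
  store i8 0, i8 addrspace(1)* %p.0, align 1   ; iteration 0
  store i8 0, i8 addrspace(1)* %p.1, align 1   ; iteration 1; a second copy would mean the loop was unrolled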
+ +; CHECK-LABEL: @test +; CHECK: store i8 0, i8 addrspace(1)* +; CHECK-NOT: store i8 0, i8 addrspace(1)* +; CHECK: ret void +define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { +entry: + %add = add nsw i32 %b, 4 + %cmp = icmp sgt i32 %add, %a + br i1 %cmp, label %for.cond.preheader, label %if.end7 + +for.cond.preheader: ; preds = %entry + %cmp313 = icmp slt i32 %b, %a + br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit + +if.then4.lr.ph: ; preds = %for.cond.preheader + %0 = sext i32 %c to i64 + br label %if.then4 + +if.then4: ; preds = %if.then4.lr.ph, %if.then4 + %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] + %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] + %add2 = add nsw i32 %b.addr.014, 1 + %1 = sext i32 %b.addr.014 to i64 + %add.ptr.sum = add nsw i64 %1, %0 + %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum + store i8 0, i8 addrspace(1)* %add.ptr5, align 1 + %inc = add nsw i32 %i.015, 1 + %cmp1 = icmp slt i32 %inc, 4 + %cmp3 = icmp slt i32 %add2, %a + %or.cond = and i1 %cmp3, %cmp1 + br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge + +for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 + br label %if.end7.loopexit + +if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader + br label %if.end7 + +if.end7: ; preds = %if.end7.loopexit, %entry + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll new file mode 100644 index 00000000000..11438f267ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}uaddo_i64_zext: +; SI: add +; SI: addc +; SI: addc + +; EG: ADDC_UINT +; EG: ADDC_UINT +define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i32: +; SI: s_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i32: +; SI: v_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + 
%carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll new file mode 100644 index 00000000000..de22a22e502 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -0,0 +1,48 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG-LABEL: {{^}}test: +;EG-NOT: SETGE_INT +;EG: CF_END + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1) * %in + %b = load i32, i32 addrspace(1) * %b_ptr + %result = udiv i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;The code generated by udiv is long and complex and may frequently change. 
+;The goal of this test is to make sure the ISel doesn't fail when it gets +;a v4i32 udiv + +;EG-LABEL: {{^}}test2: +;EG: CF_END +;SI-LABEL: {{^}}test2: +;SI: s_endpgm + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = udiv <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}test4: +;EG: CF_END +;SI-LABEL: {{^}}test4: +;SI: s_endpgm + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = udiv <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll new file mode 100644 index 00000000000..b3837f28209 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -0,0 +1,345 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_udivrem: +; EG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG: MULLO_INT +; EG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] +; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] +; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] +; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %result0 = udiv i32 %x, %y + store i32 %result0, i32 addrspace(1)* %out + %result1 = urem i32 %x, %y + store i32 %result1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_udivrem_v2: +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT 
+; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { + %result0 = udiv <2 x i32> %x, %y + store <2 x i32> %result0, <2 x i32> addrspace(1)* %out + %result1 = urem <2 x i32> %x, %y + store <2 x i32> %result1, <2 x i32> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_udivrem_v4: +; EG-DAG: RECIP_UINT 
+; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], 
[[SECOND_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] +; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] +; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { + %result0 = udiv <4 x i32> %x, %y + store <4 x i32> %result0, <4 x i32> addrspace(1)* %out + %result1 = urem <4 x i32> %x, %y + store <4 x i32> %result1, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll new file mode 100644 index 00000000000..4de881b66f1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -0,0 +1,245 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}udiv24_i8: +; SI: v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: 
FLT_TO_UINT +define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = udiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i32: +; SI: v_cvt_f32_u32 +; SI-DAG: v_cvt_f32_u32 +; SI-DAG: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}udiv25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}urem24_i8: +; SI: v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = urem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}urem24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = urem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}urem24_i32: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}urem25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 
%num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/udivrem64.ll b/llvm/test/CodeGen/AMDGPU/udivrem64.ll new file mode 100644 index 00000000000..9f3069bdf80 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/udivrem64.ll @@ -0,0 +1,223 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_udiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = udiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = 
lshr i64 %y, 33 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll new file mode 100644 index 00000000000..dfec8eb15cb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = uitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 +define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %cast = uitofp i64 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 +define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { + %cast = uitofp <2 x i64> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 +define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { + %cast = uitofp <4 x i64> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %cast = uitofp i32 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: 
{{^}}s_uint_to_fp_v2i32_to_v2f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { + %cast = uitofp <2 x i32> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { + %cast = uitofp <4 x i32> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 +; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll new file mode 100644 index 00000000000..00fea80b1bc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z + +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = uitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = uitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * 
%in + %result = uitofp <4 x i32> %value to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: +; R600: UINT_TO_FLT +; R600: UINT_TO_FLT +; R600: MULADD_IEEE +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 +; SI: s_endpgm +define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { +entry: + %0 = uitofp i64 %in to float + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll new file mode 100644 index 00000000000..82d88ebd3ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -0,0 +1,254 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}unaligned_load_store_i16_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { + %v = load i16, i16 addrspace(3)* %p, align 1 + store i16 %v, i16 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i16_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { + %v = load i16, i16 addrspace(1)* %p, align 1 + store i16 %v, i16 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { + %v = load i32, i32 addrspace(3)* %p, align 1 + store i32 %v, i32 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { + %v = load i32, i32 addrspace(1)* %p, align 1 + store i32 %v, i32 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 
+; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { + %v = load i64, i64 addrspace(3)* %p, align 1 + store i64 %v, i64 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { + %v = load i64, i64 addrspace(1)* %p, align 1 + store i64 %v, i64 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { + %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 + store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 + ret void +} + +; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
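+; Since none of the RUN lines pass FIXME-SI as a -check-prefix, the FIXME-SI
+; lines below are never matched by FileCheck; they only document the
+; byte-by-byte expansion this test would be expected to check once the
+; custom v4i32 lowering is addressed.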
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+; FIXME-SI: buffer_load_ubyte
+define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind {
+ %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4:
+; SI: ds_read2_b32
+; SI: s_endpgm
+define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %val = load i64, i64 addrspace(3)* %in, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
+ %val = load i64, i64 addrspace(3)* %ptr, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; This tests the case where the lo offset is 8 bits, but the hi offset is 9 bits
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+ %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
+ %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+ %val = load i64, i64 addrspace(3)* %ptri64, align 4
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_1:
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+
+define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+ %val = load i64, i64 addrspace(3)* %in, align 1
+ store i64 %val, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4:
+; SI: ds_write2_b32
+; SI: s_endpgm
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+ store i64 %val, i64 addrspace(3)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+ %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
+ store i64 0, i64 addrspace(3)* %ptr, align 4
+ ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; This tests the case where the lo offset is 8 bits, but the hi offset is 9 bits
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+ %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+ %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
+ %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+ store i64 0, i64 addrspace(3)* %ptri64, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
new file mode 100644
index 00000000000..036a7e91b47
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -0,0 +1,115 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
+; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s
+
+; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
+
+; COMMON-LABEL: {{^}}branch_true:
+define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 true, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum
+ %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+ %5 = load i32, i32 addrspace(1)* %4, align 4
+ %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum
+ %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)*
+ %7 = load i32, i32 addrspace(1)* %6, align 4
+ %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum
+ %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)*
+ %9 = load i32, i32 addrspace(1)* %8, align 4
+ %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; COMMON-LABEL: {{^}}branch_false:
+; SI: .text
+; SI-NEXT: s_endpgm
+define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+entry:
+ br i1 false, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %add.ptr.sum = shl i32 %main_stride, 1
+ %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride
+ %add.ptr4.sum = shl i32 %main_stride, 2
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
+ %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)*
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride
+ %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+ %3 = load i32, i32 addrspace(1)* %2, align 4
+ %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* 
%main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; COMMON-LABEL: {{^}}branch_undef: +; SI: .text +; SI-NEXT: s_endpgm +define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 undef, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/unroll.ll b/llvm/test/CodeGen/AMDGPU/unroll.ll new file mode 100644 index 00000000000..411a15a4b83 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unroll.ll @@ -0,0 +1,36 @@ +; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s +; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s + + +; This test contains a simple loop that initializes an array declared in +; private memory. We want to make sure these kinds of loops are always +; unrolled, because private memory is slow. 
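+; As a rough sketch (assuming the usual -loop-unroll/-sroa behavior here,
+; not checked literally), the function below should fold to nothing more
+; than:
+;
+;   define void @test(i32 addrspace(1)* %out) {
+;   entry:
+;     store i32 5, i32 addrspace(1)* %out
+;     ret void
+;   }
+;
+; once the loop is fully unrolled, the alloca is promoted, and the load of
+; element 5 is constant-folded; the CHECK lines below verify exactly that.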
+ +; CHECK-LABEL: @test +; CHECK-NOT: alloca +; CHECK: store i32 5, i32 addrspace(1)* %out +define void @test(i32 addrspace(1)* %out) { +entry: + %0 = alloca [32 x i32] + br label %loop.header + +loop.header: + %counter = phi i32 [0, %entry], [%inc, %loop.inc] + br label %loop.body + +loop.body: + %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter + store i32 %counter, i32* %ptr + br label %loop.inc + +loop.inc: + %inc = add i32 %counter, 1 + %1 = icmp sge i32 %counter, 32 + br i1 %1, label %exit, label %loop.header + +exit: + %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 + %3 = load i32, i32* %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll new file mode 100644 index 00000000000..8ab4faf2f14 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests are for condition codes that are not supported by the hardware + +; CHECK-LABEL: {{^}}slt: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @slt(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp slt i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ult i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float: +; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ult_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float_native: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ult_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}olt: +; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @olt(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}sle: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @sle(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sle i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ule i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float: +; CHECK: 
SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ule_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float_native: +; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ule_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ole: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT:1084227584(5.000000e+00) +define void @ole(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/urecip.ll b/llvm/test/CodeGen/AMDGPU/urecip.ll new file mode 100644 index 00000000000..daacc771708 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/urecip.ll @@ -0,0 +1,13 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_rcp_iflag_f32_e32 + +define void @test(i32 %p, i32 %q) { + %i = udiv i32 %p, %q + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll new file mode 100644 index 00000000000..62841ec2d6c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/urem.ll @@ -0,0 +1,94 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; The code generated by urem is long and complex and may frequently +; change. 
The goal of this test is to make sure the ISel doesn't fail +; when it gets a v2i32/v4i32 urem + +; FUNC-LABEL: {{^}}test_urem_i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = urem i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 +; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] +; SI: v_subrev_i32 +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = urem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = urem <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = urem <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = urem i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = urem <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = urem <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll new file mode 100644 index 00000000000..f26f30022b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float 
@llvm.fma.f32(float, float, float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 + + +; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { + %dbl = fadd float %a, %a + store float %dbl, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; Don't use fma since fma c, x, y is canonicalized to fma x, c, y +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { + %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 + store i32 %fma, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll new file mode 100644 index 00000000000..3c9b1622a07 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}usubo_i64_zext: + +; EG: SUBB_UINT +; EG: ADDC_UINT +define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i32: +; SI: s_sub_i32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i32: +; SI: v_subrev_i32_e32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + 
%val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i64: +; SI: v_sub_i32 +; SI: v_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll new file mode 100644 index 00000000000..31755125c03 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -0,0 +1,17 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +; CHECK-LABEL: {{^}}kernel_arg_i64: +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; i64 arg works, v1i64 arg does not. +; CHECK-LABEL: {{^}}kernel_arg_v1i64: +define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { + store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll new file mode 100644 index 00000000000..c368c5aaf7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; SI-LABEL: {{^}}v_cnd_nan_nosgpr: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { + %idx = call i32 @llvm.r600.read.tidig.x() #1 + %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx + %f = load float, float addrspace(1)* %fptr + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + + +; This requires slightly trickier SGPR operand legalization since the +; single constant bus SGPR usage is the last operand, and it should +; never be moved. 
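+; (Hedged reminder of the underlying rule: a SI VOP3 instruction such as
+;   v_cndmask_b32_e64 v0, v1, -1, s[0:1]
+; may read at most one SGPR through the constant bus; here that single use
+; is the mask s[0:1] in the last operand, while -1 is an inline constant,
+; so the legalizer must not create a second SGPR operand or displace the
+; mask.)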
+ +; SI-LABEL: {{^}}v_cnd_nan: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll new file mode 100644 index 00000000000..7d0ebd139f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -0,0 +1,188 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: @test_if +; Make sure the i1 values created by the cfg structurizer pass are +; moved using VALU instructions +; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 +; SI: v_mov_b32_e32 v{{[0-9]}}, -1 +define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +entry: + switch i32 %a, label %default [ + i32 0, label %case0 + i32 1, label %case1 + ] + +case0: + %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 0, i32 addrspace(1)* %arrayidx1, align 4 + br label %end + +case1: + %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 1, i32 addrspace(1)* %arrayidx5, align 4 + br label %end + +default: + %cmp8 = icmp eq i32 %a, 2 + %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + br i1 %cmp8, label %if, label %else + +if: + store i32 2, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +else: + store i32 3, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +end: + ret void +} + +; SI-LABEL: @simple_test_v_if +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] + +; SI: ; BB#1 +; SI: buffer_store_dword +; SI: s_endpgm + +; SI: BB1_2: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %store, label %exit + +store: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; SI-LABEL: @simple_test_v_loop +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: s_cbranch_execz BB2_2 + +; SI: ; BB#1: +; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} + +; SI: BB2_3: +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: v_cmp_eq_i32_e32 vcc, +; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] +; SI: s_andn2_b64 exec, exec, [[OR_SREG]] +; SI: s_cbranch_execnz BB2_3 + +define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + %limit = add i32 %tid, 64 + br i1 %is.0, label %loop, label %exit + +loop: + %i = phi i32 [%tid, %entry], [%i.inc, %loop] + %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i + %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i + %load = load i32, i32 addrspace(1)* %src + store i32 
%load, i32 addrspace(1)* %gep.dst + %i.inc = add nsw i32 %i, 1 + %cmp = icmp eq i32 %limit, %i.inc + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +; SI-LABEL: @multi_vcond_loop + +; Load loop limit from buffer +; Branch to exit if uniformly not taken +; SI: ; BB#0: +; SI: buffer_load_dword [[VBOUND:v[0-9]+]] +; SI: v_cmp_lt_i32_e32 vcc +; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] +; SI: s_cbranch_execz BB3_2 + +; Initialize inner condition to false +; SI: ; BB#1: +; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] + +; Clear exec bits for workitems that load -1s +; SI: BB3_3: +; SI: buffer_load_dword [[B:v[0-9]+]] +; SI: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] +; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] +; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] +; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_cbranch_execz BB3_5 + +; SI: BB#4: +; SI: buffer_store_dword +; SI: v_cmp_ge_i64_e32 vcc +; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] + +; SI: BB3_5: +; SI: s_or_b64 exec, exec, [[ORNEG1]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_andn2_b64 exec, exec, [[COND_STATE]] +; SI: s_cbranch_execnz BB3_3 + +; SI: BB#6 +; SI: s_or_b64 exec, exec, [[COND_STATE]] + +; SI: BB3_2: +; SI-NOT: [[COND_STATE]] +; SI: s_endpgm + +define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +bb: + %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 + %tmp4 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 + %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 + %tmp7 = icmp sgt i32 %tmp6, 0 + %tmp8 = sext i32 %tmp6 to i64 + br i1 %tmp7, label %bb10, label %bb26 + +bb10: ; preds = %bb, %bb20 + %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] + %tmp12 = add nsw i64 %tmp11, %tmp4 + %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 + %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 + %tmp17 = icmp ne i32 %tmp14, -1 + %tmp18 = icmp ne i32 %tmp16, -1 + %tmp19 = and i1 %tmp17, %tmp18 + br i1 %tmp19, label %bb20, label %bb26 + +bb20: ; preds = %bb10 + %tmp21 = add nsw i32 %tmp16, %tmp14 + %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 + store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 + %tmp23 = add nuw nsw i64 %tmp11, 1 + %tmp24 = icmp slt i64 %tmp23, %tmp8 + br i1 %tmp24, label %bb10, label %bb26 + +bb26: ; preds = %bb10, %bb20, %bb + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll new file mode 100644 index 00000000000..6f3b4847fbd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA 
-check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}vector_read:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 1, i32* %y
+ store i32 2, i32* %z
+ store i32 3, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
+ %2 = load i32, i32* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_write:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+; EG: MOVA_INT
+define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
+ store i32 1, i32* %1
+ %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
+ %3 = load i32, i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test should be optimized to:
+; store i32 0, i32 addrspace(1)* %out
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
+define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %2 = bitcast i32* %1 to [4 x i32]*
+ %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
+ %4 = load i32, i32* %3
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
new file mode 100644
index 00000000000..fb6a17e6714
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
+
+; NI: {{^}}vtx_fetch32:
+; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM: {{^}}vtx_fetch32:
+; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding:
[0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 + +define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; NI: {{^}}vtx_fetch128: +; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 +; XXX: Add a case for Cayman when v4i32 stores are supported. + +define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in + store <4 x i32> %0, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll new file mode 100644 index 00000000000..9b2f229c05a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test that we correctly commute a sub instruction +; FUNC-LABEL: {{^}}sub_rev: +; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s +; SI: v_subrev_i32_e32 v{{[0-9]+}}, s + +; ModuleID = 'vop-shrink.ll' + +define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +entry: + %vgpr = call i32 @llvm.r600.read.tidig.x() #1 + %tmp = icmp eq i32 %cond, 0 + br i1 %tmp, label %if, label %else + +if: ; preds = %entry + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %tmp2 = extractelement <4 x i32> %sgpr, i32 1 + store i32 %tmp2, i32 addrspace(1)* %out + br label %endif + +else: ; preds = %entry + %tmp3 = extractelement <4 x i32> %sgpr, i32 2 + %tmp4 = sub i32 %vgpr, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %out + br label %endif + +endif: ; preds = %else, %if + ret void +} + +; Test that we fold an immediate that was illegal for a 64-bit op into the +; 32-bit op when we shrink it. 
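+; A worked sketch of the arithmetic behind the literal checked below:
+; 1.024000e+03 is 1024.0 = 2^10, so its IEEE-754 single-precision encoding is
+; sign 0, biased exponent 10 + 127 = 137 = 0x89, mantissa 0, which packs to
+; 0b0100_0100_1000_0000_0000_0000_0000_0000 = 0x44800000, the operand of the
+; v_add_f32_e32 check.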
+ +; FUNC-LABEL: {{^}}add_fold: +; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 +define void @add_fold(float addrspace(1)* %out) { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = uitofp i32 %tmp to float + %tmp2 = fadd float %tmp1, 1.024000e+03 + store float %tmp2, float addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll new file mode 100644 index 00000000000..a3014b03d2b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -0,0 +1,77 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG: {{^}}test_select_v2i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +entry: + %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 + %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 + %cmp = icmp ne <2 x i32> %0, %1 + %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v2f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2f32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 + %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %cmp = fcmp une <2 x float> %0, %1 + %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v4i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 + %cmp = icmp ne <4 x i32> %0, %1 + %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], 
PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
+ %cmp = fcmp une <4 x float> %0, %1
+ %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vselect64.ll b/llvm/test/CodeGen/AMDGPU/vselect64.ll
new file mode 100644
index 00000000000..ef85ebe7899
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: {{^}}test_select_v4i64:
+; Make sure the vectors aren't being stored on the stack. We know they are
+; being stored on the stack if the shader uses at least 10 registers.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+ %cmp = icmp ne <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
+ %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
new file mode 100644
index 00000000000..4584d6e2525
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
+
+; This tests for a bug where vertex fetch clauses right before an ENDIF
+; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
+; for the ALU clause before the vertex fetch instead of emitting a POP instruction
+; after the fetch clause.
+
+
+; CHECK-LABEL: {{^}}test:
+; CHECK-NOT: ALU_POP_AFTER
+; CHECK: TEX
+; CHECK-NEXT: POP
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %endif, label %if
+
+if:
+ %1 = load i32, i32 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %x = phi i32 [ %1, %if], [ 0, %entry]
+ store i32 %x, i32 addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
new file mode 100644
index 00000000000..912e258ebb8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test is for a scheduler bug where VTX_READ instructions that used
+; the result of another VTX_READ instruction were being grouped in the
+; same fetch clause.
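+; (Background, informally: an R600 fetch clause issues all of its VTX_READ
+; instructions before any of their results are available, so a read whose
+; address depends on another read in the same clause would consume an
+; undefined value. The checks below expect the two dependent reads to be
+; split into two separate fetch clauses.)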
+ +; CHECK: {{^}}test: +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 +define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +entry: + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll new file mode 100644 index 00000000000..5cc7577cad3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_load_dwordx4 +; CHECK: s_load_dwordx4 +; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; CHECK: s_endpgm +define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { +main_body: + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) + %tmp12 = extractelement <4 x float> %tmp11, i32 0 + %tmp13 = extractelement <4 x float> %tmp11, i32 1 + call void @llvm.AMDGPU.barrier.global() #1 + %tmp14 = extractelement <4 x float> %tmp11, i32 2 +; %tmp15 = extractelement <4 x float> %tmp11, i32 3 + %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 + %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.global() #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind readnone } + +!0 = !{!1, !1, i64 0, i32 1} +!1 = !{!"const", null} diff --git a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll new file mode 100644 index 00000000000..4328e964c1b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -0,0 +1,238 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; 
RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}ngroups_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].X + +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 
[[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}get_work_dim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @get_work_dim (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; The tgid values are stored in sgprs offset by the number of user sgprs. +; Currently we always use exactly 2 user sgprs for the pointer to the +; kernel arguments, but this may change in the future. + +; FUNC-LABEL: {{^}}tgid_x: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_y: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_z: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_x: +; GCN: buffer_store_dword v0 +define void @tidig_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_y: +; GCN: buffer_store_dword v1 +define void @tidig_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_z: +; GCN: buffer_store_dword v2 +define void @tidig_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.ngroups.x() #0 +declare i32 @llvm.r600.read.ngroups.y() #0 +declare i32 @llvm.r600.read.ngroups.z() #0 + +declare i32 @llvm.r600.read.global.size.x() #0 +declare i32 @llvm.r600.read.global.size.y() #0 +declare i32 @llvm.r600.read.global.size.z() #0 + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +declare i32 @llvm.r600.read.tgid.x() #0 +declare i32 @llvm.r600.read.tgid.y() #0 +declare i32 @llvm.r600.read.tgid.z() #0 + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 +declare i32 @llvm.r600.read.tidig.z() #0 + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } 
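+
+; A note on the SI/VI offset pairs checked above (an informal sketch of the
+; encodings): SI's s_load_dword immediate offset is in dwords while VI's is
+; in bytes, so the same kernarg slot shows up as e.g. 0x1 on SI and 0x4 on VI.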
diff --git a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll new file mode 100644 index 00000000000..8b383e4c393 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s + +; We want all MULLO_INT inst to be last in their instruction group +;CHECK: {{^}}fill3d: +;CHECK-NOT: MULLO_INT T[0-9]+ + +define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +entry: + %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 + %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 + %mul = mul i32 %y.i18, %x.i + %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 + %mul3 = mul i32 %mul, %z.i17 + %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 + %mul26.i = mul i32 %x.i12.i, %x.i.i + %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.i16 = add i32 %x.i4.i, %mul26.i + %mul7 = mul i32 %add.i16, %y.i18 + %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 + %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 + %mul30.i = mul i32 %y.i14.i, %y.i.i + %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %add.i14 = add i32 %mul30.i, %mul7 + %mul819 = add i32 %add.i14, %y.i6.i + %add = mul i32 %mul819, %z.i17 + %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 + %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 + %mul33.i = mul i32 %z.i16.i, %z.i.i + %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 + %add.i = add i32 %z.i8.i, %mul33.i + %add13 = add i32 %add.i, %add + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 + store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.z() #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!opencl.kernels = !{!0, !1, !2} + +!0 = !{null} +!1 = !{null} +!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll new file mode 100644 index 00000000000..089db59eabc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -0,0 +1,173 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn 
-mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}xor_v2i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 + %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_v4i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 + %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_i1: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} +; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} +; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1) * %in0 + %b = load float, float addrspace(1) * %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 1.000000e+00 + %xor = xor i1 %acmp, %bcmp + %result = select i1 %xor, float %a, float %b + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_xor_i1: +; SI: buffer_load_ubyte [[B:v[0-9]+]] +; SI: buffer_load_ubyte [[A:v[0-9]+]] +; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] +; SI: buffer_store_byte [[RESULT]] +define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { + %a = load i1, i1 addrspace(1)* %in0 + %b = load i1, i1 addrspace(1)* %in1 + %xor = xor i1 %a, %b + store i1 %xor, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i32: +; SI: v_xor_b32_e32 +define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i32: +; SI: s_xor_b32 +define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; 
FUNC-LABEL: {{^}}scalar_not_i32: +; SI: s_not_b32 +define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i32: +; SI: v_not_b32 +define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i64: +; SI: v_xor_b32_e32 +; SI: v_xor_b32_e32 +; SI: s_endpgm +define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i64: +; SI: s_xor_b64 +; SI: s_endpgm +define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_not_i64: +; SI: s_not_b64 +define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i64: +; SI: v_not_b32 +; SI: v_not_b32 +define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Test that we have a pattern to match xor inside a branch. +; Note that in the future the backend may be smart enough to +; use an SALU instruction for this. 
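+; (Informal background: the SALU has a native 64-bit s_xor_b64, while the
+; VALU only has 32-bit v_xor_b32, so an i64 xor that ends up on the vector
+; path is split into two 32-bit halves, as the vector_xor_i64 checks above
+; show.)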
+ +; FUNC-LABEL: {{^}}xor_cf: +; SI: s_xor_b64 +define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64, i64 addrspace(1)* %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll new file mode 100644 index 00000000000..033055db185 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}test: +; R600: MEM_RAT_CACHELESS STORE_RAW +; R600: MEM_RAT_CACHELESS STORE_RAW + +; SI: {{^}}test: +; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} +; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] +; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} +define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = mul i32 %a, %b + %1 = add i32 %0, %c + %2 = zext i32 %1 to i64 + store i64 %2, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}testi1toi32: +; SI: v_cndmask_b32 +define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = zext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}zext_i1_to_i64: +; SI: s_mov_b32 s{{[0-9]+}}, 0 +; SI: v_cmp_eq_i32 +; SI: v_cndmask_b32 +define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %ext = zext i1 %cmp to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/R600/32-bit-local-address-space.ll b/llvm/test/CodeGen/R600/32-bit-local-address-space.ll deleted file mode 100644 index c7bcfd2ddab..00000000000 --- a/llvm/test/CodeGen/R600/32-bit-local-address-space.ll +++ /dev/null @@ -1,139 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and -; the global address space(1) uses 64-bit pointers. These tests check to make sure -; the correct pointer size is used for the local address space. - -; The e{{32|64}} suffix on the instructions refers to the encoding size and not -; the size of the operands. The operand size is denoted in the instruction name. -; Instructions with B32, U32, and I32 in their name take 32-bit operands, while -; instructions with B64, U64, and I64 take 64-bit operands. 
- -; FUNC-LABEL: {{^}}local_address_load: -; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] -; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] -define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep: -; SI: s_add_i32 [[SPTR:s[0-9]]] -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 -define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. -; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}null_32bit_lds_ptr: -; SI: v_cmp_ne_i32 -; SI-NOT: v_cmp_ne_i32 -; SI: v_cndmask_b32 -define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { - %cmp = icmp ne i32 addrspace(3)* %lds, null - %x = select i1 %cmp, i32 123, i32 456 - store i32 %x, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul_32bit_ptr: -; SI: s_mul_i32 -; SI-NEXT: s_add_i32 -; SI: ds_read_b32 -define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { - %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 - %val = load float, float addrspace(3)* %ptr - store float %val, float addrspace(1)* %out - ret void -} - -@g_lds = addrspace(3) global float undef, align 4 - -; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] -define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { - %val = load float, float addrspace(3)* @g_lds - store float %val, float addrspace(1)* %out - ret void -} - - -@ptr = addrspace(3) global i32 addrspace(3)* undef -@dst = addrspace(3) global [16384 x i32] undef - -; FUNC-LABEL: {{^}}global_ptr: -; SI: ds_write_b32 -define void @global_ptr() nounwind { - store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr - ret void -} - -; FUNC-LABEL: {{^}}local_address_store: -; SI: ds_write_b32 -define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { - store i32 %val, i32 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_store: -; SI: s_add_i32 [[SADDR:s[0-9]+]], -; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] -; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} -define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { - %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 %offset - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} -; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 -define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. -; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} -define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/README b/llvm/test/CodeGen/R600/README deleted file mode 100644 index 96998bba28f..00000000000 --- a/llvm/test/CodeGen/R600/README +++ /dev/null @@ -1,21 +0,0 @@ -+==============================================================================+ -| How to organize the lit tests | -+==============================================================================+ - -- If you write a test for matching a single DAG opcode or intrinsic, it should - go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) - -- If you write a test that matches several DAG opcodes and checks for a single - ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. - bfi_int.ll - -- For all other tests, use your best judgement for organizing tests and naming - the files. - -+==============================================================================+ -| Naming conventions | -+==============================================================================+ - -- Use dash '-' and not underscore '_' to separate words in file names, unless - the file is named after a DAG opcode or ISA instruction that has an - underscore '_' in its name. diff --git a/llvm/test/CodeGen/R600/add-debug.ll b/llvm/test/CodeGen/R600/add-debug.ll deleted file mode 100644 index 529905dd36a..00000000000 --- a/llvm/test/CodeGen/R600/add-debug.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug -; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug -; REQUIRES: asserts - -; Check that SelectionDAGDumper does not crash on int_SI_if. 
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} - diff --git a/llvm/test/CodeGen/R600/add.ll b/llvm/test/CodeGen/R600/add.ll deleted file mode 100644 index 655e75dbc1a..00000000000 --- a/llvm/test/CodeGen/R600/add.ll +++ /dev/null @@ -1,192 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test1: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} -;SI-NOT: [[REG]] -;SI: buffer_store_dword [[REG]], -define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test2: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = add <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test4: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = add <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test8: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { -entry: - %0 = add <8 x i32> %a, %b - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test16: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: 
ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { -entry: - %0 = add <16 x i32> %a, %b - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}add64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = add i64 %a, %b - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they -; use VCC. The test is designed so that %a will be stored in an SGPR and -; %0 will be stored in a VGPR, so the comiler will be forced to copy %a -; to a VGPR before doing the add. - -; FUNC-LABEL: {{^}}add64_sgpr_vgpr: -; SI-NOT: v_addc_u32_e32 s - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - %1 = add i64 %a, %0 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; Test i64 add inside a branch. -; FUNC-LABEL: {{^}}add64_in_branch: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/add_i64.ll b/llvm/test/CodeGen/R600/add_i64.ll deleted file mode 100644 index 8346add7df9..00000000000 --- a/llvm/test/CodeGen/R600/add_i64.ll +++ /dev/null @@ -1,84 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; SI-LABEL: {{^}}test_i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* %a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = add i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Check that the SGPR add operand is correctly moved to a VGPR. 
-; SI-LABEL: {{^}}sgpr_operand:
-; SI: v_add_i32
-; SI: v_addc_u32
-define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
- %foo = load i64, i64 addrspace(1)* %in, align 8
- %result = add i64 %foo, %a
- store i64 %result, i64 addrspace(1)* %out
- ret void
-}
-
-; Swap the arguments. Check that the SGPR -> VGPR copy works with the
-; SGPR as the other operand.
-;
-; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: v_add_i32
-; SI: v_addc_u32
-define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
- %foo = load i64, i64 addrspace(1)* %in, align 8
- %result = add i64 %a, %foo
- store i64 %result, i64 addrspace(1)* %out
- ret void
-}
-
-
-; SI-LABEL: {{^}}test_v2i64_sreg:
-; SI: s_add_u32
-; SI: s_addc_u32
-; SI: s_add_u32
-; SI: s_addc_u32
-define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
- %result = add <2 x i64> %a, %b
- store <2 x i64> %result, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}test_v2i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
-; SI: v_add_i32
-; SI: v_addc_u32
-define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.r600.read.tidig.x() readnone
- %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
- %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
- %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
- %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
- %result = add <2 x i64> %a, %b
- store <2 x i64> %result, <2 x i64> addrspace(1)* %out
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_i64_add_to_i32:
-; SI: s_load_dword s[[SREG0:[0-9]+]]
-; SI: s_load_dword s[[SREG1:[0-9]+]]
-; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: addc
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
- %add = add i64 %b, %a
- %trunc = trunc i64 %add to i32
- store i32 %trunc, i32 addrspace(1)* %out, align 8
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/address-space.ll b/llvm/test/CodeGen/R600/address-space.ll
deleted file mode 100644
index 4be8c584752..00000000000
--- a/llvm/test/CodeGen/R600/address-space.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; Test that codegenprepare understands address space sizes
-
-%struct.foo = type { [3 x float], [3 x float] }
-
-; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
-; already in a VGPR after the first read.
-
-; CHECK-LABEL: {{^}}do_as_ptr_calcs:
-; CHECK: s_load_dword [[SREG1:s[0-9]+]],
-; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]]
-; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20
-define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
-entry:
- %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
- %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
- br label %bb32
-
-bb32:
- %a = load float, float addrspace(3)* %x, align 4
- %b = load float, float addrspace(3)* %y, align 4
- %cmp = fcmp one float %a, %b
- br i1 %cmp, label %bb34, label %bb33
-
-bb33:
- unreachable
-
-bb34:
- unreachable
-}
-
-
diff --git a/llvm/test/CodeGen/R600/and.ll b/llvm/test/CodeGen/R600/and.ll
deleted file mode 100644
index 5672d470bd7..00000000000
--- a/llvm/test/CodeGen/R600/and.ll
+++ /dev/null
@@ -1,296 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test2:
-; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
- %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
- %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
- %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
- %result = and <2 x i32> %a, %b
- store <2 x i32> %result, <2 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}test4:
-; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
- %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
- %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
- %result = and <4 x i32> %a, %b
- store <4 x i32> %result, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_i32:
-; SI: s_and_b32
-define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
- %and = and i32 %a, %b
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_constant_i32:
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
-define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
- %and = and i32 %a, 1234567
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_i32:
-; SI: v_and_b32
-define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %b = load i32, i32 addrspace(1)* %bptr, align 4
- %and = and i32 %a, %b
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_constant_i32:
-; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
-define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %and = and i32 %a, 1234567
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32:
-; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
-define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %and = and i32 %a, 64
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32:
-; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
-define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
- %a = load i32, i32 addrspace(1)* %aptr, align 4
- %and = and i32 %a, -16
- store i32 %and, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_i64:
-; SI: s_and_b64
-define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
- %and = and i64 %a, %b
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FIXME: Should use SGPRs
-; FUNC-LABEL: {{^}}s_and_i1:
-; SI: v_and_b32
-define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
- %and = and i1 %a, %b
- store i1 %and, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_constant_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
- %and = and i64 %a, 281474976710655
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_i64:
-; SI: v_and_b32
-; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
- %and = and i64 %a, %b
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_i64_br:
-; SI: v_and_b32
-; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
-entry:
- %tmp0 = icmp eq i32 %cond, 0
- br i1 %tmp0, label %if, label %endif
-
-if:
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %b = load i64, i64 addrspace(1)* %bptr, align 8
- %and = and i64 %a, %b
- br label %endif
-
-endif:
- %tmp1 = phi i64 [%and, %if], [0, %entry]
- store i64 %tmp1, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_and_constant_i64:
-; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %and = and i64 %a, 1234567
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FIXME: Replace and 0 with mov 0
-; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
-; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
-; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
-define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
- %a = load i64, i64 addrspace(1)* %aptr, align 8
- %and = and i64 %a, 64
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
-define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 64
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1
-define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 1
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
-define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 4607182418800017408
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
-define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 13830554455654793216
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
-define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 4602678819172646912
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
-define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 13826050856027422720
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
-define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 4611686018427387904
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0
-define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 13835058055282163712
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
-define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 4616189618054758400
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64:
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
-define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 13839561654909534208
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-
-; Test with the 64-bit integer bitpattern for a 32-bit float in the
-; low 32-bits, which is not a valid 64-bit inline immediate.
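[Editor's note, not part of the original patch: a quick check of the constants in the tests below. The f32 bit pattern for 4.0 is 0x40800000, and 1082130432 == 0x40800000, so the i64 mask carries the f32 value in its low half and zeros in its high half; likewise -1065353216 == 0xffffffffc0800000, i.e. -4.0f in the low half and all ones in the high half. Each 32-bit half can be set with a single s_mov_b32, but the combined 64-bit value is not an inline immediate, so the checks expect the constant to be materialized in two halves, roughly (register numbers illustrative):

  s_mov_b32 s0, 4.0                ; low half = 0x40800000
  s_mov_b32 s1, 0                  ; high half = 0
  s_and_b64 s[2:3], s[4:5], s[0:1] ; apply the assembled 64-bit mask
]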
-
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
-define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 1082130432
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FIXME: Copy of -1 register
-; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64:
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
-; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}
-define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, -1065353216
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; Shift into upper 32-bits
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64:
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
-define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 4647714815446351872
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
-; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
-; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
-define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
- %and = and i64 %a, 13871086852301127680
- store i64 %and, i64 addrspace(1)* %out, align 8
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/anyext.ll b/llvm/test/CodeGen/R600/anyext.ll
deleted file mode 100644
index 48d8f312249..00000000000
--- a/llvm/test/CodeGen/R600/anyext.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-; CHECK-LABEL: {{^}}anyext_i1_i32:
-; CHECK: v_cndmask_b32_e64
-define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
-entry:
- %0 = icmp eq i32 %cond, 0
- %1 = zext i1 %0 to i8
- %2 = xor i8 %1, -1
- %3 = and i8 %2, 1
- %4 = zext i8 %3 to i32
- store i32 %4, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/array-ptr-calc-i32.ll b/llvm/test/CodeGen/R600/array-ptr-calc-i32.ll
deleted file mode 100644
index 8c2a0795860..00000000000
--- a/llvm/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-
-declare i32 @llvm.SI.tid() nounwind readnone
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
-
-; The pointer calculations required for the alloca'd memory actually need
-; an add and won't be folded into the addressing, which fails with a
-; 64-bit pointer add. This should work since private pointers should
-; be 32-bits.
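[Editor's note, not part of the original patch: the GEP that defeats folding in the test below is

  %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b

whose byte offset is 16 + 4*%b. The constant 16 could fold into the addressing mode, but the 4*%b term is only known at run time, so an explicit add of the scaled index survives; with a 32-bit private address space that is a single v_add_i32 rather than a 64-bit v_add_i32/v_addc_u32 pair.]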
-
-; SI-LABEL: {{^}}test_private_array_ptr_calc:
-
-; FIXME: We end up with a zero argument for the ADD, because
-; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
-; with the appropriate offset. We should fold this into the store.
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
-; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
-;
-; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
-; alloca to a vector. It currently fails because it does not know how
-; to interpret:
-; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
-
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16
-; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %alloca = alloca [4 x i32], i32 4, align 16
- %tid = call i32 @llvm.SI.tid() readnone
- %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
- %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
- %a = load i32, i32 addrspace(1)* %a_ptr
- %b = load i32, i32 addrspace(1)* %b_ptr
- %result = add i32 %a, %b
- %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
- store i32 %result, i32* %alloca_ptr, align 4
- ; Dummy call
- call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
- %reload = load i32, i32* %alloca_ptr, align 4
- %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
- ret void
-}
-
diff --git a/llvm/test/CodeGen/R600/array-ptr-calc-i64.ll b/llvm/test/CodeGen/R600/array-ptr-calc-i64.ll
deleted file mode 100644
index eae095eb844..00000000000
--- a/llvm/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare i32 @llvm.SI.tid() readnone
-
-; SI-LABEL: {{^}}test_array_ptr_calc:
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_i32
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
- %tid = call i32 @llvm.SI.tid() readnone
- %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
- %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
- %a = load i32, i32 addrspace(1)* %a_ptr
- %b = load i32, i32 addrspace(1)* %b_ptr
- %result = add i32 %a, %b
- store i32 %result, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/R600/atomic_cmp_swap_local.ll
deleted file mode 100644
index ef2560ef184..00000000000
--- a/llvm/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
- %result = extractvalue { i32, i1 } %pair, 0
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
-; GCN: buffer_store_dwordx2 [[RESULT]],
-; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
- %result = extractvalue { i64, i1 } %pair, 0
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset:
-; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
- %sub = sub i32 %a, %b
- %add = add i32 %sub, 4
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
- %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
- %result = extractvalue { i32, i1 } %pair, 0
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
- %result = extractvalue { i32, i1 } %pair, 0
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
- %result = extractvalue { i64, i1 } %pair, 0
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/atomic_load_add.ll b/llvm/test/CodeGen/R600/atomic_load_add.ll
deleted file mode 100644
index 20c685447ee..00000000000
--- a/llvm/test/CodeGen/R600/atomic_load_add.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}atomic_add_local:
-; R600: LDS_ADD *
-; SI: ds_add_u32
-define void @atomic_add_local(i32 addrspace(3)* %local) {
- %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
-; R600: LDS_ADD *
-; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
- %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
- %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_add_ret_local:
-; R600: LDS_ADD_RET *
-; SI: ds_add_rtn_u32
-define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
- %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
- store i32 %val, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
-; R600: LDS_ADD_RET *
-; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
- %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
- %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
- store i32 %val, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/atomic_load_sub.ll b/llvm/test/CodeGen/R600/atomic_load_sub.ll
deleted file mode 100644
index 4c6f45525b9..00000000000
--- a/llvm/test/CodeGen/R600/atomic_load_sub.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}atomic_sub_local:
-; R600: LDS_SUB *
-; SI: ds_sub_u32
-define void @atomic_sub_local(i32 addrspace(3)* %local) {
- %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
-; R600: LDS_SUB *
-; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
- %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
- %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_sub_ret_local:
-; R600: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32
-define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
- %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
- store i32 %val, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
-; R600: LDS_SUB_RET *
-; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
- %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
- %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
- store i32 %val, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/basic-branch.ll b/llvm/test/CodeGen/R600/basic-branch.ll
deleted file mode 100644
index abdc4afef47..00000000000
--- a/llvm/test/CodeGen/R600/basic-branch.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; XFAIL: *
-; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}test_branch(
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
- %cmp = icmp ne i32 %val, 0
- br i1 %cmp, label %store, label %end
-
-store:
- store i32 222, i32 addrspace(1)* %out
- ret void
-
-end:
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/basic-loop.ll b/llvm/test/CodeGen/R600/basic-loop.ll
deleted file mode 100644
index f0263caf5d6..00000000000
--- a/llvm/test/CodeGen/R600/basic-loop.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
-; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}test_loop:
-define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
-entry:
- br label %loop.body
-
-loop.body:
- %i = phi i32 [0, %entry], [%i.inc, %loop.body]
- store i32 222, i32 addrspace(1)* %out
- %cmp = icmp ne i32 %i, %val
- %i.inc = add i32 %i, 1
- br i1 %cmp, label %loop.body, label %end
-
-end:
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/bfe_uint.ll b/llvm/test/CodeGen/R600/bfe_uint.ll
deleted file mode 100644
index 32e3fc26106..00000000000
--- a/llvm/test/CodeGen/R600/bfe_uint.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK: {{^}}bfe_def:
-; CHECK: BFE_UINT
-define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
-entry:
- %0 = lshr i32 %x, 5
- %1 = and i32 %0, 15 ; 0xf
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; This program could be implemented using a BFE_UINT instruction; however,
-; since the lshr constant plus the number of bits in the mask is >= 32, it
-; can also be implemented with an LSHR instruction, which is better because
-; LSHR has fewer operands and requires fewer constants.
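[Editor's note, not part of the original patch: to make the comment above concrete, bfe_shift below computes (x >> 16) & 0xffff. The shift amount (16) plus the mask width (16) is >= 32, so the mask only clears bits the logical shift has already zeroed, and a lone LSHR suffices:

  %shift = lshr i32 %x, 16         ; upper 16 bits of %shift are now zero
  %masked = and i32 %shift, 65535  ; no-op: mask covers exactly the bits that can still be set

In bfe_def above, by contrast, 5 + 4 < 32, so the mask does real work and BFE_UINT folds the shift and mask into one instruction.]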
-
-; CHECK: {{^}}bfe_shift:
-; CHECK-NOT: BFE_UINT
-define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
-entry:
- %0 = lshr i32 %x, 16
- %1 = and i32 %0, 65535 ; 0xffff
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/bfi_int.ll b/llvm/test/CodeGen/R600/bfi_int.ll
deleted file mode 100644
index 03349349735..00000000000
--- a/llvm/test/CodeGen/R600/bfi_int.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; BFI_INT Definition pattern from ISA docs
-; (y & x) | (z & ~x)
-;
-; R600: {{^}}bfi_def:
-; R600: BFI_INT
-; SI: @bfi_def
-; SI: v_bfi_b32
-define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
-entry:
- %0 = xor i32 %x, -1
- %1 = and i32 %z, %0
- %2 = and i32 %y, %x
- %3 = or i32 %1, %2
- store i32 %3, i32 addrspace(1)* %out
- ret void
-}
-
-; SHA-256 Ch function
-; z ^ (x & (y ^ z))
-; R600: {{^}}bfi_sha256_ch:
-; R600: BFI_INT
-; SI: @bfi_sha256_ch
-; SI: v_bfi_b32
-define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
-entry:
- %0 = xor i32 %y, %z
- %1 = and i32 %x, %0
- %2 = xor i32 %z, %1
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; SHA-256 Ma function
-; ((x & z) | (y & (x | z)))
-; R600: {{^}}bfi_sha256_ma:
-; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
-; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
-; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
-
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
-entry:
- %0 = and i32 %x, %z
- %1 = or i32 %x, %z
- %2 = and i32 %y, %1
- %3 = or i32 %0, %2
- store i32 %3, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/big_alu.ll b/llvm/test/CodeGen/R600/big_alu.ll
deleted file mode 100644
index 2671c5d102b..00000000000
--- a/llvm/test/CodeGen/R600/big_alu.ll
+++ /dev/null
@@ -1,1173 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=cedar
-
-;This test ensures that the R600 backend can handle if-conversion (ifcvt)
-;properly and does not generate ALU clauses with more than 128 instructions.
- -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { -main_body: - %0 = extractelement <4 x float> %reg0, i32 0 - %1 = extractelement <4 x float> %reg0, i32 1 - %2 = extractelement <4 x float> %reg0, i32 2 - %3 = extractelement <4 x float> %reg0, i32 3 - %4 = extractelement <4 x float> %reg1, i32 0 - %5 = extractelement <4 x float> %reg9, i32 0 - %6 = extractelement <4 x float> %reg8, i32 0 - %7 = fcmp ugt float %6, 0.000000e+00 - %8 = select i1 %7, float %4, float %5 - %9 = extractelement <4 x float> %reg1, i32 1 - %10 = extractelement <4 x float> %reg9, i32 1 - %11 = extractelement <4 x float> %reg8, i32 0 - %12 = fcmp ugt float %11, 0.000000e+00 - %13 = select i1 %12, float %9, float %10 - %14 = extractelement <4 x float> %reg1, i32 2 - %15 = extractelement <4 x float> %reg9, i32 2 - %16 = extractelement <4 x float> %reg8, i32 0 - %17 = fcmp ugt float %16, 0.000000e+00 - %18 = select i1 %17, float %14, float %15 - %19 = extractelement <4 x float> %reg1, i32 3 - %20 = extractelement <4 x float> %reg9, i32 3 - %21 = extractelement <4 x float> %reg8, i32 0 - %22 = extractelement <4 x float> %reg2, i32 0 - %23 = extractelement <4 x float> %reg2, i32 1 - %24 = extractelement <4 x float> %reg2, i32 2 - %25 = extractelement <4 x float> %reg2, i32 3 - %26 = extractelement <4 x float> %reg3, i32 0 - %27 = extractelement <4 x float> %reg3, i32 1 - %28 = extractelement <4 x float> %reg3, i32 2 - %29 = extractelement <4 x float> %reg3, i32 3 - %30 = extractelement <4 x float> %reg4, i32 0 - %31 = extractelement <4 x float> %reg4, i32 1 - %32 = extractelement <4 x float> %reg4, i32 2 - %33 = extractelement <4 x float> %reg4, i32 3 - %34 = extractelement <4 x float> %reg5, i32 0 - %35 = extractelement <4 x float> %reg5, i32 1 - %36 = extractelement <4 x float> %reg5, i32 2 - %37 = extractelement <4 x float> %reg5, i32 3 - %38 = extractelement <4 x float> %reg6, i32 0 - %39 = extractelement <4 x float> %reg6, i32 1 - %40 = extractelement <4 x float> %reg6, i32 2 - %41 = extractelement <4 x float> %reg6, i32 3 - %42 = extractelement <4 x float> %reg7, i32 0 - %43 = extractelement <4 x float> %reg7, i32 1 - %44 = extractelement <4 x float> %reg7, i32 2 - %45 = extractelement <4 x float> %reg7, i32 3 - %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %47 = extractelement <4 x float> %46, i32 0 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %49 = extractelement <4 x float> %48, i32 1 - %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %51 = extractelement <4 x float> %50, i32 2 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %53 = extractelement <4 x float> %52, i32 0 - %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %55 = extractelement <4 x float> %54, i32 0 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %57 
= extractelement <4 x float> %56, i32 1 - %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %59 = extractelement <4 x float> %58, i32 2 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %61 = extractelement <4 x float> %60, i32 3 - %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %63 = extractelement <4 x float> %62, i32 0 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %65 = extractelement <4 x float> %64, i32 1 - %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %67 = extractelement <4 x float> %66, i32 2 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %69 = extractelement <4 x float> %68, i32 0 - %70 = fcmp oge float %69, 3.500000e+00 - %71 = sext i1 %70 to i32 - %72 = bitcast i32 %71 to float - %73 = bitcast float %72 to i32 - %74 = icmp ne i32 %73, 0 - %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 - %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %76 = extractelement <4 x float> %75, i32 0 - %77 = fcmp oge float %76, 2.000000e+00 - %78 = sext i1 %77 to i32 - %79 = bitcast i32 %78 to float - %80 = bitcast float %79 to i32 - %81 = icmp ne i32 %80, 0 - br i1 %81, label %IF137, label %ENDIF136 - -IF137: ; preds = %main_body - %82 = insertelement <4 x float> undef, float %30, i32 0 - %83 = insertelement <4 x float> %82, float %31, i32 1 - %84 = insertelement <4 x float> %83, float %32, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = insertelement <4 x float> undef, float %30, i32 0 - %87 = insertelement <4 x float> %86, float %31, i32 1 - %88 = insertelement <4 x float> %87, float %32, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq.f32(float %90) - %92 = fmul float %30, %91 - %93 = fmul float %31, %91 - %94 = fmul float %32, %91 - %95 = insertelement <4 x float> undef, float %92, i32 0 - %96 = insertelement <4 x float> %95, float %93, i32 1 - %97 = insertelement <4 x float> %96, float %94, i32 2 - %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 - %99 = insertelement <4 x float> undef, float %37, i32 0 - %100 = insertelement <4 x float> %99, float %38, i32 1 - %101 = insertelement <4 x float> %100, float %39, i32 2 - %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 - %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) - %104 = insertelement <4 x float> undef, float %92, i32 0 - %105 = insertelement <4 x float> %104, float %93, i32 1 - %106 = insertelement <4 x float> %105, float %94, i32 2 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 - %108 = insertelement <4 x float> undef, float %40, i32 0 - %109 = insertelement <4 x float> %108, float %41, i32 1 - %110 = insertelement <4 x float> %109, float %42, i32 2 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 - %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) - %113 = fsub float -0.000000e+00, %92 - %114 = fsub float -0.000000e+00, %93 - %115 = fsub float -0.000000e+00, %94 - %116 = insertelement <4 x float> undef, float %34, i32 0 - %117 = insertelement <4 x float> %116, float %35, i32 1 - %118 = insertelement <4 x float> %117, float %36, i32 2 - %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 - %120 = insertelement <4 x float> undef, float %113, i32 0 - %121 = insertelement <4 x float> %120, float %114, i32 1 - %122 = insertelement <4 x float> %121, float %115, i32 2 - %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 - %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) - %125 = fdiv float 1.000000e+00, %124 - %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %127 = extractelement <4 x float> %126, i32 0 - %128 = fmul float %127, %125 - %129 = fmul float %103, %128 - %130 = fmul float %112, %128 - %131 = bitcast float %. to i32 - %132 = sitofp i32 %131 to float - %133 = fdiv float 1.000000e+00, %132 - %134 = bitcast float %. to i32 - %135 = add i32 %134, -1 - %136 = bitcast i32 %135 to float - %137 = bitcast float %136 to i32 - br label %LOOP - -ENDIF136: ; preds = %main_body, %ENDIF154 - %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] - %138 = fmul float %26, 0x3F847AE140000000 - %139 = fmul float %27, 0x3F847AE140000000 - %140 = fmul float %28, 0x3F847AE140000000 - %141 = insertelement <4 x float> undef, float %138, i32 0 - %142 = insertelement <4 x float> %141, float %139, i32 1 - %143 = insertelement <4 x float> %142, float %140, i32 2 - %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 - %145 = extractelement <4 x float> %144, i32 0 - %146 = extractelement <4 x float> %144, i32 1 - %147 = extractelement <4 x float> %144, i32 2 - %148 = extractelement <4 x float> %144, i32 3 - %149 = insertelement <4 x float> undef, float %145, i32 0 - %150 = insertelement <4 x float> %149, float %146, i32 1 - %151 = insertelement <4 x float> %150, float %147, i32 2 - %152 = insertelement <4 x float> %151, float %148, i32 3 - %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) - %154 = extractelement <4 x float> %153, i32 0 - %155 = extractelement <4 x float> %153, i32 1 - %156 = extractelement <4 x float> %153, i32 2 - %157 = extractelement <4 x float> %153, i32 3 - %158 = fmul float %26, 0x3F45A07B40000000 - %159 = fmul float %27, 0x3F45A07B40000000 - %160 = fmul float %28, 0x3F45A07B40000000 - %161 = insertelement <4 x float> undef, float %158, i32 0 - %162 = insertelement <4 x float> %161, float %159, i32 1 - %163 = insertelement <4 x float> %162, float %160, i32 2 - %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 - %165 = extractelement <4 x float> %164, i32 0 - %166 = extractelement <4 x float> %164, i32 1 - %167 = extractelement <4 x float> %164, i32 2 - %168 = extractelement <4 x float> %164, i32 3 - %169 = insertelement <4 x float> undef, float %165, i32 0 - %170 = insertelement <4 x float> %169, float %166, i32 1 - %171 = insertelement <4 x float> %170, float %167, i32 2 - %172 = insertelement <4 x float> %171, float %168, i32 3 - %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) - 
%174 = extractelement <4 x float> %173, i32 0 - %175 = extractelement <4 x float> %173, i32 1 - %176 = extractelement <4 x float> %173, i32 2 - %177 = extractelement <4 x float> %173, i32 3 - %178 = fmul float %176, 3.000000e+03 - %179 = fadd float %178, %28 - %180 = fdiv float 1.000000e+00, %33 - %181 = fmul float %32, %180 - %182 = call float @fabs(float %181) - %183 = fmul float %174, 0x3FD99999A0000000 - %184 = fadd float %183, 0x3FAEB851E0000000 - %185 = fmul float %175, 0x3FE3333340000000 - %186 = fadd float %185, %184 - %187 = fmul float %176, 2.000000e+00 - %188 = fadd float %187, %186 - %189 = fmul float %177, 4.000000e+00 - %190 = fadd float %189, %188 - %191 = fmul float %154, 0x3FB99999A0000000 - %192 = fadd float %191, %190 - %193 = fmul float %155, 0x3FD99999A0000000 - %194 = fadd float %193, %192 - %195 = fmul float %156, 0x3FE99999A0000000 - %196 = fadd float %195, %194 - %197 = fmul float %157, 0x4000CCCCC0000000 - %198 = fadd float %197, %196 - %199 = fmul float 0xBE5EFB4CC0000000, %182 - %200 = fmul float %199, %182 - %201 = call float @llvm.AMDIL.exp.(float %200) - %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) - %203 = fadd float %202, 0x3FF4CCCCC0000000 - %204 = fmul float %203, 0x3FE1C71C80000000 - %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) - %206 = fadd float %202, 0x3FF4CCCCC0000000 - %207 = fmul float %206, 0x3FE1C71C80000000 - %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) - %209 = fadd float %202, 2.000000e+00 - %210 = fmul float %209, 0x3FD611A7A0000000 - %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) - %212 = fmul float 2.000000e+00, %205 - %213 = fsub float -0.000000e+00, %212 - %214 = fadd float 3.000000e+00, %213 - %215 = fmul float %205, %214 - %216 = fmul float %205, %215 - %217 = fmul float 2.000000e+00, %208 - %218 = fsub float -0.000000e+00, %217 - %219 = fadd float 3.000000e+00, %218 - %220 = fmul float %208, %219 - %221 = fmul float %208, %220 - %222 = fmul float 2.000000e+00, %211 - %223 = fsub float -0.000000e+00, %222 - %224 = fadd float 3.000000e+00, %223 - %225 = fmul float %211, %224 - %226 = fmul float %211, %225 - %227 = fmul float %26, 0x3F368B5CC0000000 - %228 = fmul float %27, 0x3F368B5CC0000000 - %229 = insertelement <4 x float> undef, float %227, i32 0 - %230 = insertelement <4 x float> %229, float %228, i32 1 - %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 - %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 - %233 = extractelement <4 x float> %232, i32 0 - %234 = extractelement <4 x float> %232, i32 1 - %235 = insertelement <4 x float> undef, float %233, i32 0 - %236 = insertelement <4 x float> %235, float %234, i32 1 - %237 = insertelement <4 x float> %236, float undef, i32 2 - %238 = insertelement <4 x float> %237, float undef, i32 3 - %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) - %240 = extractelement <4 x float> %239, i32 0 - %241 = insertelement <4 x float> undef, float %240, i32 0 - %242 = insertelement <4 x float> %241, float %228, i32 1 - %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 - %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 - %245 = extractelement <4 x float> %244, i32 0 - %246 = insertelement <4 x float> undef, float %245, i32 0 - %247 = insertelement <4 x float> %246, float undef, i32 1 - %248 = insertelement <4 x float> %247, float undef, i32 2 - 
%249 = insertelement <4 x float> %248, float undef, i32 3 - %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) - %251 = extractelement <4 x float> %250, i32 0 - %252 = extractelement <4 x float> %250, i32 1 - %253 = extractelement <4 x float> %250, i32 2 - %254 = extractelement <4 x float> %250, i32 3 - %255 = fmul float %251, %216 - %256 = fmul float %252, %221 - %257 = fmul float %253, %226 - %258 = fmul float %254, 0.000000e+00 - %259 = fadd float %202, 0x3FF4CCCCC0000000 - %260 = fmul float %259, 0x3FE1C71C80000000 - %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) - %262 = fadd float %202, 0x3FF4CCCCC0000000 - %263 = fmul float %262, 0x3FE1C71C80000000 - %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) - %265 = fadd float %202, 2.000000e+00 - %266 = fmul float %265, 0x3FD611A7A0000000 - %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) - %268 = fmul float 2.000000e+00, %261 - %269 = fsub float -0.000000e+00, %268 - %270 = fadd float 3.000000e+00, %269 - %271 = fmul float %261, %270 - %272 = fmul float %261, %271 - %273 = fmul float 2.000000e+00, %264 - %274 = fsub float -0.000000e+00, %273 - %275 = fadd float 3.000000e+00, %274 - %276 = fmul float %264, %275 - %277 = fmul float %264, %276 - %278 = fmul float 2.000000e+00, %267 - %279 = fsub float -0.000000e+00, %278 - %280 = fadd float 3.000000e+00, %279 - %281 = fmul float %267, %280 - %282 = fmul float %267, %281 - %283 = fmul float %26, 0x3F22DFD6A0000000 - %284 = fmul float %27, 0x3F22DFD6A0000000 - %285 = insertelement <4 x float> undef, float %283, i32 0 - %286 = insertelement <4 x float> %285, float %284, i32 1 - %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 - %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 - %289 = extractelement <4 x float> %288, i32 0 - %290 = extractelement <4 x float> %288, i32 1 - %291 = insertelement <4 x float> undef, float %289, i32 0 - %292 = insertelement <4 x float> %291, float %290, i32 1 - %293 = insertelement <4 x float> %292, float undef, i32 2 - %294 = insertelement <4 x float> %293, float undef, i32 3 - %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) - %296 = extractelement <4 x float> %295, i32 0 - %297 = extractelement <4 x float> %295, i32 1 - %298 = extractelement <4 x float> %295, i32 2 - %299 = extractelement <4 x float> %295, i32 3 - %300 = fmul float %296, %272 - %301 = fmul float %297, %277 - %302 = fmul float %298, %282 - %303 = fmul float %299, 0.000000e+00 - %304 = fmul float %temp68.1, %37 - %305 = fmul float %temp68.1, %38 - %306 = fmul float %temp68.1, %39 - %307 = fmul float %temp69.0, %40 - %308 = fadd float %307, %304 - %309 = fmul float %temp69.0, %41 - %310 = fadd float %309, %305 - %311 = fmul float %temp69.0, %42 - %312 = fadd float %311, %306 - %313 = fmul float %temp70.0, %34 - %314 = fadd float %313, %308 - %315 = fmul float %temp70.0, %35 - %316 = fadd float %315, %310 - %317 = fmul float %temp70.0, %36 - %318 = fadd float %317, %312 - %319 = insertelement <4 x float> undef, float %314, i32 0 - %320 = insertelement <4 x float> %319, float %316, i32 1 - %321 = insertelement <4 x float> %320, float %318, i32 2 - %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 - %323 = insertelement <4 x float> undef, float %314, i32 0 - %324 = insertelement <4 x float> %323, float %316, i32 1 - %325 = insertelement <4 x float> %324, float %318, i32 2 - 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 - %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq.f32(float %327) - %329 = fmul float %314, %328 - %330 = fmul float %316, %328 - %331 = fmul float %318, %328 - %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %333 = extractelement <4 x float> %332, i32 0 - %334 = fsub float -0.000000e+00, %333 - %335 = fadd float 1.000000e+00, %334 - %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %337 = extractelement <4 x float> %336, i32 0 - %338 = fsub float -0.000000e+00, %337 - %339 = fadd float 1.000000e+00, %338 - %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %341 = extractelement <4 x float> %340, i32 0 - %342 = fsub float -0.000000e+00, %341 - %343 = fadd float 1.000000e+00, %342 - %344 = fsub float -0.000000e+00, %335 - %345 = fadd float %202, %344 - %346 = fsub float -0.000000e+00, %339 - %347 = fadd float %202, %346 - %348 = fadd float %347, 0xBFE3333340000000 - %349 = fsub float -0.000000e+00, %202 - %350 = fsub float -0.000000e+00, %343 - %351 = fadd float %349, %350 - %352 = insertelement <4 x float> undef, float %43, i32 0 - %353 = insertelement <4 x float> %352, float %44, i32 1 - %354 = insertelement <4 x float> %353, float %45, i32 2 - %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 - %356 = insertelement <4 x float> undef, float %43, i32 0 - %357 = insertelement <4 x float> %356, float %44, i32 1 - %358 = insertelement <4 x float> %357, float %45, i32 2 - %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 - %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq.f32(float %360) - %362 = fmul float %45, %361 - %363 = call float @fabs(float %362) - %364 = fmul float %176, 0x3FECCCCCC0000000 - %365 = fadd float %364, %363 - %366 = fadd float %365, 0xBFEFAE1480000000 - %367 = fmul float %366, 0xC023FFFFC0000000 - %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) - %369 = fsub float -0.000000e+00, %335 - %370 = fadd float %202, %369 - %371 = fadd float %370, 0x3FBEB851E0000000 - %372 = fsub float -0.000000e+00, %339 - %373 = fadd float %202, %372 - %374 = fadd float %373, 0xBFE0A3D700000000 - %375 = fsub float -0.000000e+00, %202 - %376 = fsub float -0.000000e+00, %343 - %377 = fadd float %375, %376 - %378 = insertelement <4 x float> undef, float %43, i32 0 - %379 = insertelement <4 x float> %378, float %44, i32 1 - %380 = insertelement <4 x float> %379, float %45, i32 2 - %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 - %382 = insertelement <4 x float> undef, float %43, i32 0 - %383 = insertelement <4 x float> %382, float %44, i32 1 - %384 = insertelement <4 x float> %383, float %45, i32 2 - %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 - %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq.f32(float %386) - %388 = fmul float %45, %387 - %389 = call float @fabs(float %388) - %390 = fmul float %176, 0x3FF51EB860000000 - %391 = fadd float %390, %389 - %392 = fadd float %391, 0xBFEFAE1480000000 - %393 = fmul float %392, 0xC0490001A0000000 - %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) - %395 = fmul float 2.000000e+00, %368 - %396 = fsub float -0.000000e+00, %395 - %397 = fadd float 3.000000e+00, %396 - %398 = fmul float %368, %397 - %399 = fmul float %368, %398 - %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) - %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) - %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) - %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) - %404 = fmul float 2.000000e+00, %394 - %405 = fsub float -0.000000e+00, %404 - %406 = fadd float 3.000000e+00, %405 - %407 = fmul float %394, %406 - %408 = fmul float %394, %407 - %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) - %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) - %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) - %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) - %413 = fcmp oge float 2.200000e+03, %179 - %414 = sext i1 %413 to i32 - %415 = bitcast i32 %414 to float - %416 = bitcast float %415 to i32 - %417 = icmp ne i32 %416, 0 - br i1 %417, label %IF161, label %ENDIF160 - -LOOP: ; preds = %ENDIF139, %IF137 - %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] - %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] - %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] - %418 = bitcast float %temp96.0 to i32 - %419 = icmp sge i32 %418, %137 - %420 = sext i1 %419 to i32 - %421 = bitcast i32 %420 to float - %422 = bitcast float %421 to i32 - %423 = icmp ne i32 %422, 0 - br i1 %423, label %IF140, label %ENDIF139 - -IF140: ; preds = %LOOP - %424 = fmul float %133, 5.000000e-01 - %425 = fmul float %129, %temp92.0 - %426 = fadd float %425, %22 - %427 = fmul float %130, %temp92.0 - %428 = fadd float %427, %23 - %429 = insertelement <4 x float> undef, float %426, i32 0 - %430 = insertelement <4 x float> %429, float %428, i32 1 - %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 - %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 - %433 = extractelement <4 x float> %432, i32 0 - %434 = extractelement <4 x float> %432, i32 1 - %435 = insertelement <4 x float> undef, float %433, i32 0 - %436 = insertelement <4 x float> %435, float %434, i32 1 - %437 = insertelement <4 x float> %436, float undef, i32 2 - %438 = insertelement <4 x float> %437, float undef, i32 3 - %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) - %440 = extractelement <4 x float> %439, i32 3 - %441 = fcmp oge float %temp92.0, %440 - %442 = sext i1 %441 to i32 - %443 = bitcast i32 %442 to float - %444 = bitcast float %443 to i32 - %445 = icmp ne i32 %444, 0 - br i1 %445, label %IF146, label %ENDIF145 - -ENDIF139: ; preds = %LOOP - %446 = fadd float %temp88.0, %133 - %447 = fmul float %129, %446 - %448 = fadd float %447, %22 - %449 = fmul float %130, %446 - %450 = fadd float %449, %23 - %451 = insertelement <4 x float> undef, float %448, i32 0 - %452 = insertelement <4 x float> %451, float %450, i32 1 - %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 - %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 - %455 = extractelement <4 x float> %454, i32 0 - %456 = extractelement <4 x float> %454, i32 1 - %457 = insertelement <4 x float> undef, float %455, i32 0 - %458 = insertelement <4 x float> %457, float %456, i32 1 - %459 = 
insertelement <4 x float> %458, float undef, i32 2 - %460 = insertelement <4 x float> %459, float undef, i32 3 - %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) - %462 = extractelement <4 x float> %461, i32 3 - %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 - %464 = sext i1 %463 to i32 - %465 = bitcast i32 %464 to float - %466 = fcmp oge float %446, %462 - %467 = sext i1 %466 to i32 - %468 = bitcast i32 %467 to float - %469 = bitcast float %465 to i32 - %470 = bitcast float %468 to i32 - %471 = and i32 %469, %470 - %472 = bitcast i32 %471 to float - %473 = bitcast float %472 to i32 - %474 = icmp ne i32 %473, 0 - %.temp92.0 = select i1 %474, float %446, float %temp92.0 - %475 = bitcast float %temp96.0 to i32 - %476 = add i32 %475, 1 - %477 = bitcast i32 %476 to float - br label %LOOP - -IF146: ; preds = %IF140 - %478 = fmul float 2.000000e+00, %424 - %479 = fsub float -0.000000e+00, %478 - %480 = fadd float %temp92.0, %479 - br label %ENDIF145 - -ENDIF145: ; preds = %IF140, %IF146 - %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] - %481 = fadd float %temp88.1, %424 - %482 = fmul float %424, 5.000000e-01 - %483 = fmul float %129, %481 - %484 = fadd float %483, %22 - %485 = fmul float %130, %481 - %486 = fadd float %485, %23 - %487 = insertelement <4 x float> undef, float %484, i32 0 - %488 = insertelement <4 x float> %487, float %486, i32 1 - %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 - %490 = insertelement <4 x float> %489, float %440, i32 3 - %491 = extractelement <4 x float> %490, i32 0 - %492 = extractelement <4 x float> %490, i32 1 - %493 = insertelement <4 x float> undef, float %491, i32 0 - %494 = insertelement <4 x float> %493, float %492, i32 1 - %495 = insertelement <4 x float> %494, float undef, i32 2 - %496 = insertelement <4 x float> %495, float undef, i32 3 - %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) - %498 = extractelement <4 x float> %497, i32 3 - %499 = fcmp oge float %481, %498 - %500 = sext i1 %499 to i32 - %501 = bitcast i32 %500 to float - %502 = bitcast float %501 to i32 - %503 = icmp ne i32 %502, 0 - br i1 %503, label %IF149, label %ENDIF148 - -IF149: ; preds = %ENDIF145 - %504 = fmul float 2.000000e+00, %482 - %505 = fsub float -0.000000e+00, %504 - %506 = fadd float %481, %505 - br label %ENDIF148 - -ENDIF148: ; preds = %ENDIF145, %IF149 - %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] - %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] - %507 = fadd float %temp88.2, %482 - %508 = fmul float %482, 5.000000e-01 - %509 = fmul float %129, %507 - %510 = fadd float %509, %22 - %511 = fmul float %130, %507 - %512 = fadd float %511, %23 - %513 = insertelement <4 x float> undef, float %510, i32 0 - %514 = insertelement <4 x float> %513, float %512, i32 1 - %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 - %516 = insertelement <4 x float> %515, float %498, i32 3 - %517 = extractelement <4 x float> %516, i32 0 - %518 = extractelement <4 x float> %516, i32 1 - %519 = insertelement <4 x float> undef, float %517, i32 0 - %520 = insertelement <4 x float> %519, float %518, i32 1 - %521 = insertelement <4 x float> %520, float undef, i32 2 - %522 = insertelement <4 x float> %521, float undef, i32 3 - %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) - %524 = extractelement <4 x float> %523, i32 3 - %525 = fcmp oge float %507, %524 - %526 = sext i1 %525 to i32 - %527 = bitcast i32 %526 to float - 
%528 = bitcast float %527 to i32 - %529 = icmp ne i32 %528, 0 - br i1 %529, label %IF152, label %ENDIF151 - -IF152: ; preds = %ENDIF148 - %530 = fmul float 2.000000e+00, %508 - %531 = fsub float -0.000000e+00, %530 - %532 = fadd float %507, %531 - br label %ENDIF151 - -ENDIF151: ; preds = %ENDIF148, %IF152 - %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] - %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] - %533 = fadd float %temp88.3, %508 - %534 = fmul float %508, 5.000000e-01 - %535 = fmul float %129, %533 - %536 = fadd float %535, %22 - %537 = fmul float %130, %533 - %538 = fadd float %537, %23 - %539 = insertelement <4 x float> undef, float %536, i32 0 - %540 = insertelement <4 x float> %539, float %538, i32 1 - %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 - %542 = insertelement <4 x float> %541, float %524, i32 3 - %543 = extractelement <4 x float> %542, i32 0 - %544 = extractelement <4 x float> %542, i32 1 - %545 = insertelement <4 x float> undef, float %543, i32 0 - %546 = insertelement <4 x float> %545, float %544, i32 1 - %547 = insertelement <4 x float> %546, float undef, i32 2 - %548 = insertelement <4 x float> %547, float undef, i32 3 - %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) - %550 = extractelement <4 x float> %549, i32 3 - %551 = fcmp oge float %533, %550 - %552 = sext i1 %551 to i32 - %553 = bitcast i32 %552 to float - %554 = bitcast float %553 to i32 - %555 = icmp ne i32 %554, 0 - br i1 %555, label %IF155, label %ENDIF154 - -IF155: ; preds = %ENDIF151 - %556 = fmul float 2.000000e+00, %534 - %557 = fsub float -0.000000e+00, %556 - %558 = fadd float %533, %557 - br label %ENDIF154 - -ENDIF154: ; preds = %ENDIF151, %IF155 - %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] - %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] - %559 = fadd float %temp88.4, %534 - %560 = fmul float %129, %559 - %561 = fadd float %560, %22 - %562 = fmul float %130, %559 - %563 = fadd float %562, %23 - %564 = insertelement <4 x float> undef, float %561, i32 0 - %565 = insertelement <4 x float> %564, float %563, i32 1 - %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 - %567 = insertelement <4 x float> %566, float %550, i32 3 - %568 = extractelement <4 x float> %567, i32 0 - %569 = extractelement <4 x float> %567, i32 1 - %570 = insertelement <4 x float> undef, float %568, i32 0 - %571 = insertelement <4 x float> %570, float %569, i32 1 - %572 = insertelement <4 x float> %571, float undef, i32 2 - %573 = insertelement <4 x float> %572, float undef, i32 3 - %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) - %575 = extractelement <4 x float> %574, i32 3 - %576 = fcmp oge float %559, %575 - %577 = sext i1 %576 to i32 - %578 = bitcast i32 %577 to float - %579 = bitcast float %578 to i32 - %580 = icmp ne i32 %579, 0 - %.temp92.4 = select i1 %580, float %559, float %temp92.4 - %581 = fmul float %129, %.temp92.4 - %582 = fadd float %581, %22 - %583 = fmul float %130, %.temp92.4 - %584 = fadd float %583, %23 - %585 = insertelement <4 x float> undef, float %582, i32 0 - %586 = insertelement <4 x float> %585, float %584, i32 1 - %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 - %588 = insertelement <4 x float> %587, float %575, i32 3 - %589 = extractelement <4 x float> %588, i32 0 - %590 = extractelement <4 x float> %588, i32 1 - %591 = insertelement <4 x float> undef, float %589, i32 0 - %592 = insertelement <4 x float> %591, 
float %590, i32 1 - %593 = insertelement <4 x float> %592, float undef, i32 2 - %594 = insertelement <4 x float> %593, float undef, i32 3 - %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) - %596 = extractelement <4 x float> %595, i32 0 - %597 = extractelement <4 x float> %595, i32 1 - %598 = extractelement <4 x float> %595, i32 2 - %599 = fmul float %596, 2.000000e+00 - %600 = fadd float %599, -1.000000e+00 - %601 = fmul float %597, 2.000000e+00 - %602 = fadd float %601, -1.000000e+00 - %603 = fmul float %598, 2.000000e+00 - %604 = fadd float %603, -1.000000e+00 - br label %ENDIF136 - -IF161: ; preds = %ENDIF136 - %605 = fmul float %202, 0x3FB99999A0000000 - %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 - %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 - %608 = fcmp uge float %607, 5.000000e-01 - %609 = select i1 %608, float 5.000000e-01, float %607 - %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) - %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) - %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) - %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) - %614 = insertelement <4 x float> undef, float %329, i32 0 - %615 = insertelement <4 x float> %614, float %330, i32 1 - %616 = insertelement <4 x float> %615, float %331, i32 2 - %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 - %618 = insertelement <4 x float> undef, float %63, i32 0 - %619 = insertelement <4 x float> %618, float %65, i32 1 - %620 = insertelement <4 x float> %619, float %67, i32 2 - %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 - %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) - %623 = fcmp uge float 0x3FE6666660000000, %622 - %624 = select i1 %623, float 0x3FE6666660000000, float %622 - %625 = fmul float %8, %624 - %626 = fmul float %13, %624 - %627 = fmul float %18, %624 - %628 = insertelement <4 x float> undef, float %34, i32 0 - %629 = insertelement <4 x float> %628, float %35, i32 1 - %630 = insertelement <4 x float> %629, float %36, i32 2 - %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 - %632 = insertelement <4 x float> undef, float %63, i32 0 - %633 = insertelement <4 x float> %632, float %65, i32 1 - %634 = insertelement <4 x float> %633, float %67, i32 2 - %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 - %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) - %637 = fcmp uge float 0x3FECCCCCC0000000, %636 - %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 - %639 = fmul float %625, %638 - %640 = fmul float %626, %638 - %641 = fmul float %627, %638 - br label %ENDIF160 - -ENDIF160: ; preds = %ENDIF136, %IF161 - %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] - %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] - %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] - %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] - %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] - %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %642 = fcmp olt float 2.200000e+03, %179 - %643 = sext i1 %642 to i32 - %644 = bitcast i32 %643 to float - %645 = fcmp olt float %179, 2.300000e+03 - %646 = sext i1 %645 to i32 - %647 = bitcast i32 %646 to float - %648 = bitcast float %644 to i32 - %649 = bitcast float %647 to i32 - %650 = and i32 %648, %649 - %651 = 
bitcast i32 %650 to float - %652 = bitcast float %651 to i32 - %653 = icmp ne i32 %652, 0 - br i1 %653, label %IF164, label %ENDIF163 - -IF164: ; preds = %ENDIF160 - %654 = fmul float %202, 5.000000e-01 - %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 - %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 - %657 = fcmp uge float %656, 0x3FD6666660000000 - %658 = select i1 %657, float 0x3FD6666660000000, float %656 - %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) - %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) - %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) - %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) - %663 = insertelement <4 x float> undef, float %329, i32 0 - %664 = insertelement <4 x float> %663, float %330, i32 1 - %665 = insertelement <4 x float> %664, float %331, i32 2 - %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 - %667 = insertelement <4 x float> undef, float %63, i32 0 - %668 = insertelement <4 x float> %667, float %65, i32 1 - %669 = insertelement <4 x float> %668, float %67, i32 2 - %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 - %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) - %672 = fcmp uge float 0x3FE6666660000000, %671 - %673 = select i1 %672, float 0x3FE6666660000000, float %671 - %674 = fmul float %8, %673 - %675 = fmul float %13, %673 - %676 = fmul float %18, %673 - %677 = insertelement <4 x float> undef, float %34, i32 0 - %678 = insertelement <4 x float> %677, float %35, i32 1 - %679 = insertelement <4 x float> %678, float %36, i32 2 - %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 - %681 = insertelement <4 x float> undef, float %63, i32 0 - %682 = insertelement <4 x float> %681, float %65, i32 1 - %683 = insertelement <4 x float> %682, float %67, i32 2 - %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 - %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) - %686 = fcmp uge float 0x3FECCCCCC0000000, %685 - %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 - %688 = fmul float %674, %687 - %689 = fmul float %675, %687 - %690 = fmul float %676, %687 - br label %ENDIF163 - -ENDIF163: ; preds = %ENDIF160, %IF164 - %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] - %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] - %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] - %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] - %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] - %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] - %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] - %691 = fcmp oge float %179, 2.300000e+03 - %692 = sext i1 %691 to i32 - %693 = bitcast i32 %692 to float - %694 = fcmp olt float %179, 2.480000e+03 - %695 = sext i1 %694 to i32 - %696 = bitcast i32 %695 to float - %697 = bitcast float %693 to i32 - %698 = bitcast float %696 to i32 - %699 = and i32 %697, %698 - %700 = bitcast i32 %699 to float - %701 = bitcast float %700 to i32 - %702 = icmp ne i32 %701, 0 - br i1 %702, label %IF167, label %ENDIF166 - -IF167: ; preds = %ENDIF163 - %703 = fmul float %202, 5.000000e-01 - %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 - %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 - %706 = fcmp uge float %705, 0x3FD3333340000000 - %707 = select i1 %706, float 0x3FD3333340000000, float %705 - %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) - %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) - %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) - %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) - %712 = insertelement <4 x float> undef, float %329, i32 0 - %713 = insertelement <4 x float> %712, float %330, i32 1 - %714 = insertelement <4 x float> %713, float %331, i32 2 - %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 - %716 = insertelement <4 x float> undef, float %63, i32 0 - %717 = insertelement <4 x float> %716, float %65, i32 1 - %718 = insertelement <4 x float> %717, float %67, i32 2 - %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 - %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) - %721 = fcmp uge float 0x3FEB333340000000, %720 - %722 = select i1 %721, float 0x3FEB333340000000, float %720 - %723 = fmul float %8, %722 - %724 = fmul float %13, %722 - %725 = fmul float %18, %722 - %726 = insertelement <4 x float> undef, float %34, i32 0 - %727 = insertelement <4 x float> %726, float %35, i32 1 - %728 = insertelement <4 x float> %727, float %36, i32 2 - %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 - %730 = insertelement <4 x float> undef, float %63, i32 0 - %731 = insertelement <4 x float> %730, float %65, i32 1 - %732 = insertelement <4 x float> %731, float %67, i32 2 - %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 - %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) - %735 = fcmp uge float 0x3FECCCCCC0000000, %734 - %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 - %737 = fmul float %723, %736 - %738 = fmul float %724, %736 - %739 = fmul float %725, %736 - br label %ENDIF166 - -ENDIF166: ; preds = %ENDIF163, %IF167 - %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] - %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] - %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] - %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] - %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] - %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] - %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] - %740 = fcmp oge float %179, 2.480000e+03 - %741 = sext i1 %740 to i32 - %742 = bitcast i32 %741 to float - %743 = fcmp olt float %179, 2.530000e+03 - %744 = sext i1 %743 to i32 - %745 = bitcast i32 %744 to float - %746 = bitcast float %742 to i32 - %747 = bitcast float %745 to i32 - %748 = and i32 %746, %747 - %749 = bitcast i32 %748 to float - %750 = bitcast float %749 to i32 - %751 = icmp ne i32 %750, 0 - br i1 %751, label %IF170, label %ENDIF169 - -IF170: ; preds = %ENDIF166 - %752 = fmul float %202, 5.000000e-01 - %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 - %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 - %755 = fcmp uge float %754, 0x3FC99999A0000000 - %756 = select i1 %755, float 0x3FC99999A0000000, float %754 - %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) - %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) - %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) - %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) - %761 = insertelement <4 x float> undef, float %329, i32 0 - %762 = insertelement <4 x float> %761, float %330, i32 1 - %763 = insertelement <4 x float> %762, float %331, i32 2 - %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 - %765 = insertelement <4 x float> undef, float %63, i32 0 - %766 = insertelement <4 x float> %765, float %65, i32 1 - %767 = insertelement <4 x float> %766, float %67, i32 2 - %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 - %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) - %770 = fcmp uge float 0x3FEB333340000000, %769 - %771 = select i1 %770, float 0x3FEB333340000000, float %769 - %772 = fmul float %8, %771 - %773 = fmul float %13, %771 - %774 = fmul float %18, %771 - %775 = insertelement <4 x float> undef, float %34, i32 0 - %776 = insertelement <4 x float> %775, float %35, i32 1 - %777 = insertelement <4 x float> %776, float %36, i32 2 - %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 - %779 = insertelement <4 x float> undef, float %63, i32 0 - %780 = insertelement <4 x float> %779, float %65, i32 1 - %781 = insertelement <4 x float> %780, float %67, i32 2 - %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 - %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) - %784 = fcmp uge float 0x3FECCCCCC0000000, %783 - %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 - %786 = fmul float %772, %785 - %787 = fmul float %773, %785 - %788 = fmul float %774, %785 - br label %ENDIF169 - -ENDIF169: ; preds = %ENDIF166, %IF170 - %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] - %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] - %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] - %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] - %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] - %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] - %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] - %789 = fcmp oge float %179, 2.530000e+03 - %790 = sext i1 %789 to i32 - %791 = bitcast i32 %790 to float - %792 = fcmp olt float %179, 2.670000e+03 - %793 = sext i1 %792 to i32 - %794 = bitcast i32 %793 to float - %795 = bitcast float %791 to i32 - %796 = bitcast float %794 to i32 - %797 = and i32 %795, %796 - %798 = bitcast i32 %797 to float - %799 = bitcast float %798 to i32 - %800 = icmp ne i32 %799, 0 - br i1 %800, label %IF173, label %ENDIF172 - -IF173: ; preds = %ENDIF169 - %801 = fmul float %202, 5.000000e-01 - %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 - %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 - %804 = fcmp uge float %803, 0x3FB99999A0000000 - %805 = select i1 %804, float 0x3FB99999A0000000, float %803 - %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) - %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) - %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) - %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) - %810 = insertelement <4 x float> undef, float %329, i32 0 - %811 = insertelement <4 x float> %810, float %330, i32 1 - %812 = insertelement <4 x float> %811, float %331, i32 2 - %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 - %814 = insertelement <4 x float> undef, float %63, i32 0 - %815 = insertelement <4 x float> %814, float %65, i32 1 - %816 = insertelement <4 x float> %815, float %67, i32 2 - %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 - %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) - %819 = fcmp uge float 0x3FEB333340000000, %818 - %820 = select i1 %819, float 
0x3FEB333340000000, float %818 - %821 = fmul float %8, %820 - %822 = fmul float %13, %820 - %823 = fmul float %18, %820 - %824 = insertelement <4 x float> undef, float %34, i32 0 - %825 = insertelement <4 x float> %824, float %35, i32 1 - %826 = insertelement <4 x float> %825, float %36, i32 2 - %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 - %828 = insertelement <4 x float> undef, float %63, i32 0 - %829 = insertelement <4 x float> %828, float %65, i32 1 - %830 = insertelement <4 x float> %829, float %67, i32 2 - %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 - %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) - %833 = fcmp uge float 0x3FECCCCCC0000000, %832 - %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 - %835 = fmul float %821, %834 - %836 = fmul float %822, %834 - %837 = fmul float %823, %834 - br label %ENDIF172 - -ENDIF172: ; preds = %ENDIF169, %IF173 - %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] - %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] - %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] - %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] - %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] - %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] - %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] - %838 = fcmp oge float %179, 2.670000e+03 - %839 = sext i1 %838 to i32 - %840 = bitcast i32 %839 to float - %841 = bitcast float %840 to i32 - %842 = icmp ne i32 %841, 0 - br i1 %842, label %IF176, label %ENDIF175 - -IF176: ; preds = %ENDIF172 - %843 = fmul float %202, 0x3FB99999A0000000 - %844 = fcmp uge float 0.000000e+00, %843 - %845 = select i1 %844, float 0.000000e+00, float %843 - %846 = fcmp uge float %845, 0x3FD99999A0000000 - %847 = select i1 %846, float 0x3FD99999A0000000, float %845 - %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) - %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) - %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) - %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) - %852 = insertelement <4 x float> undef, float %329, i32 0 - %853 = insertelement <4 x float> %852, float %330, i32 1 - %854 = insertelement <4 x float> %853, float %331, i32 2 - %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 - %856 = insertelement <4 x float> undef, float %63, i32 0 - %857 = insertelement <4 x float> %856, float %65, i32 1 - %858 = insertelement <4 x float> %857, float %67, i32 2 - %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 - %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) - %861 = fcmp uge float 0x3FEB333340000000, %860 - %862 = select i1 %861, float 0x3FEB333340000000, float %860 - %863 = fmul float %8, %862 - %864 = fmul float %13, %862 - %865 = fmul float %18, %862 - %866 = insertelement <4 x float> undef, float %34, i32 0 - %867 = insertelement <4 x float> %866, float %35, i32 1 - %868 = insertelement <4 x float> %867, float %36, i32 2 - %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 - %870 = insertelement <4 x float> undef, float %63, i32 0 - %871 = insertelement <4 x float> %870, float %65, i32 1 - %872 = insertelement <4 x float> %871, float %67, i32 2 - %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 - %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) - %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 - %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 - %877 = fmul float %863, %876 - %878 = fmul float %864, %876 - %879 = fmul float %865, %876 - br label %ENDIF175 - -ENDIF175: ; preds = %ENDIF172, %IF176 - %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] - %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] - %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] - %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] - %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] - %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] - %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] - %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %881 = extractelement <4 x float> %880, i32 0 - %882 = fcmp olt float %881, %179 - %883 = sext i1 %882 to i32 - %884 = bitcast i32 %883 to float - %885 = bitcast float %884 to i32 - %886 = icmp ne i32 %885, 0 - br i1 %886, label %IF179, label %ENDIF178 - -IF179: ; preds = %ENDIF175 - %887 = fadd float %202, 1.000000e+00 - %888 = fadd float %202, 1.000000e+00 - %889 = fadd float %202, 1.000000e+00 - %890 = insertelement <4 x float> undef, float %43, i32 0 - %891 = insertelement <4 x float> %890, float %44, i32 1 - %892 = insertelement <4 x float> %891, float %45, i32 2 - %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 - %894 = insertelement <4 x float> undef, float %43, i32 0 - %895 = insertelement <4 x float> %894, float %44, i32 1 - %896 = insertelement <4 x float> %895, float %45, i32 2 - %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 - %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq.f32(float %898) - %900 = fmul float %45, %899 - %901 = call float @fabs(float %900) - %902 = fmul float %176, 0x3FECCCCCC0000000 - %903 = fadd float %902, %901 - %904 = fadd float %903, 0xBFEFAE1480000000 - %905 = fmul float %904, 0xC043FFFE20000000 - %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) - %907 = fmul float 2.000000e+00, %906 - %908 = fsub float -0.000000e+00, %907 - %909 = fadd float 3.000000e+00, %908 - %910 = fmul float %906, %909 - %911 = fmul float %906, %910 - %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) - %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) - %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) - %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) - %916 = fmul float %202, 5.000000e-01 - %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 - %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 - %919 = fcmp uge float %918, 0x3FE3333340000000 - %920 = select i1 %919, float 0x3FE3333340000000, float %918 - %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) - %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) - %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) - %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) - %925 = insertelement <4 x float> undef, float %329, i32 0 - %926 = insertelement <4 x float> %925, float %330, i32 1 - %927 = insertelement <4 x float> %926, float %331, i32 2 - %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 - %929 = insertelement <4 x float> 
undef, float %63, i32 0 - %930 = insertelement <4 x float> %929, float %65, i32 1 - %931 = insertelement <4 x float> %930, float %67, i32 2 - %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 - %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) - %934 = fcmp uge float 0x3FE99999A0000000, %933 - %935 = select i1 %934, float 0x3FE99999A0000000, float %933 - %936 = fmul float %8, %935 - %937 = fmul float %13, %935 - %938 = fmul float %18, %935 - %939 = insertelement <4 x float> undef, float %34, i32 0 - %940 = insertelement <4 x float> %939, float %35, i32 1 - %941 = insertelement <4 x float> %940, float %36, i32 2 - %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 - %943 = insertelement <4 x float> undef, float %63, i32 0 - %944 = insertelement <4 x float> %943, float %65, i32 1 - %945 = insertelement <4 x float> %944, float %67, i32 2 - %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 - %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) - %948 = fcmp uge float 0x3FECCCCCC0000000, %947 - %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 - %950 = fmul float %936, %949 - %951 = fmul float %937, %949 - %952 = fmul float %938, %949 - br label %ENDIF178 - -ENDIF178: ; preds = %ENDIF175, %IF179 - %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] - %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] - %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] - %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] - %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] - %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] - %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] - %953 = fmul float %55, %temp92.12 - %954 = fmul float %57, %temp93.6 - %955 = fmul float %59, %temp94.6 - %956 = fmul float %61, 0.000000e+00 - %957 = fmul float %temp84.6, %953 - %958 = fmul float %temp85.6, %954 - %959 = fmul float %temp86.6, %955 - %960 = fmul float %temp87.6, %956 - %961 = fmul float %2, -2.000000e+00 - %962 = fadd float %961, 1.000000e+00 - %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) - %964 = extractelement <4 x float> %963, i32 2 - %965 = fsub float -0.000000e+00, %964 - %966 = fadd float %962, %965 - %967 = fdiv float 1.000000e+00, %966 - %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) - %969 = extractelement <4 x float> %968, i32 2 - %970 = fmul float %969, %967 - %971 = fsub float -0.000000e+00, %53 - %972 = fmul float %971, %53 - %973 = fmul float %972, %970 - %974 = fmul float %973, %970 - %975 = fmul float %974, 0x3FF7154760000000 - %976 = call float @llvm.AMDIL.exp.(float %975) - %977 = fcmp oeq float %53, 1.000000e+00 - %978 = sext i1 %977 to i32 - %979 = bitcast i32 %978 to float - %980 = bitcast float %979 to i32 - %981 = icmp ne i32 %980, 0 - %.184 = select i1 %981, float 1.000000e+00, float %976 - %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) - %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) - %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) - %985 = insertelement <4 x float> undef, float %982, i32 0 - %986 = insertelement <4 x float> %985, float %983, i32 1 - %987 = insertelement <4 x float> %986, float %984, i32 2 - %988 = insertelement <4 x float> 
%987, float %960, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } -attributes #2 = { readonly } diff --git a/llvm/test/CodeGen/R600/bitcast.ll b/llvm/test/CodeGen/R600/bitcast.ll deleted file mode 100644 index fd56d956bf3..00000000000 --- a/llvm/test/CodeGen/R600/bitcast.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; This test just checks that the compiler doesn't crash. - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; FUNC-LABEL: {{^}}v32i8_to_v8i32: -; SI: s_endpgm -define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { -entry: - %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 - %2 = bitcast <32 x i8> %1 to <8 x i32> - %3 = extractelement <8 x i32> %2, i32 1 - %4 = icmp ne i32 %3, 0 - %5 = select i1 %4, float 0.0, float 1.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: -; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* - %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 - store <16 x i8> %1, <16 x i8> addrspace(1)* %out - ret void -} - -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %load = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %load to <2 x i16> - store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 - %bc = bitcast <2 x i16> %load to float - store float %bc, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %bc = bitcast <4 x i8> %load to i32 - store i32 %bc, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %load to <4 x i8> - store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: -; SI: s_endpgm -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %add = add <2 x i32> %val, <i32 4, i32 9> - %bc = bitcast <2 x i32> %add to double - store double %bc, double 
addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { - %val = load double, double addrspace(1)* %in, align 8 - %add = fadd double %val, 4.0 - %bc = bitcast double %add to <2 x i32> - store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/bswap.ll b/llvm/test/CodeGen/R600/bswap.ll deleted file mode 100644 index 4cf8e4bfed5..00000000000 --- a/llvm/test/CodeGen/R600/bswap.ll +++ /dev/null @@ -1,115 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.bswap.i32(i32) nounwind readnone -declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone -declare i64 @llvm.bswap.i64(i64) nounwind readnone -declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone -declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone - -; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 -; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff -; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone - store i32 %bswap, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_bswap_v2i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_bswap_v4i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_bswap_v8i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; 
SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { - %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone - store i64 %bswap, i64 addrspace(1)* %out, align 8 - ret void -} - -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone - store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone - store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/llvm/test/CodeGen/R600/build_vector.ll b/llvm/test/CodeGen/R600/build_vector.ll deleted file mode 100644 index 65eacf5adc4..00000000000 --- a/llvm/test/CodeGen/R600/build_vector.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}build_vector2: -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector2: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { -entry: - store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out - ret void -} - -; R600: {{^}}build_vector4: -; R600: MOV -; R600: MOV -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector4: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 -; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 -; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/call.ll b/llvm/test/CodeGen/R600/call.ll deleted file mode 100644 index e769fd11c28..00000000000 --- a/llvm/test/CodeGen/R600/call.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported call to function external_function in test_call_external - - -declare i32 @external_function(i32) nounwind - -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load 
i32, i32 addrspace(1)* %b_ptr - %c = call i32 @external_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define i32 @defined_function(i32 %x) nounwind noinline { - %y = add i32 %x, 8 - ret i32 %y -} - -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %c = call i32 @defined_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/call_fs.ll b/llvm/test/CodeGen/R600/call_fs.ll deleted file mode 100644 index 87bebbc49d5..00000000000 --- a/llvm/test/CodeGen/R600/call_fs.ll +++ /dev/null @@ -1,17 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s - -; EG: .long 257 -; EG: {{^}}call_fs: -; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] -; R600: .long 257 -; R600: {{^}}call_fs: -; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] - - -define void @call_fs() #0 { - ret void -} - -attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/llvm/test/CodeGen/R600/cayman-loop-bug.ll b/llvm/test/CodeGen/R600/cayman-loop-bug.ll deleted file mode 100644 index c7b8c403731..00000000000 --- a/llvm/test/CodeGen/R600/cayman-loop-bug.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK: LOOP_START_DX10 -; CHECK: PUSH -; CHECK-NOT: ALU_PUSH_BEFORE -; CHECK: END_LOOP -; CHECK: END_LOOP -define void @main (<4 x float> inreg %reg0) #0 { -entry: - br label %outer_loop -outer_loop: - %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] - %cond = icmp eq i32 %cnt, 16 - br i1 %cond, label %outer_loop_body, label %exit -outer_loop_body: - %cnt_incr = add i32 %cnt, 1 - br label %inner_loop -inner_loop: - %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] - %cond2 = icmp eq i32 %cnt2, 16 - br i1 %cond, label %inner_loop_body, label %outer_loop -inner_loop_body: - %cnt2_incr = add i32 %cnt2, 1 - br label %inner_loop -exit: - ret void -} - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/R600/cf-stack-bug.ll b/llvm/test/CodeGen/R600/cf-stack-bug.ll deleted file mode 100644 index 75b87e48622..00000000000 --- a/llvm/test/CodeGen/R600/cf-stack-bug.ll +++ /dev/null @@ -1,244 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s 
--check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG32 %s < %t - -; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; REQUIRES: asserts - -; We are currently allocating 2 extra sub-entries on Evergreen / NI for -; non-WQM push instructions if we change this to 1, then we will need to -; add one level of depth to each of these tests. - -; BUG64-NOT: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.store.1 - -if.store.1: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - store i32 3, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - store i32 4, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - store i32 7, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32: Applying 
bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested8: -define void @nested8(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - %7 = icmp sgt i32 %cond, 70 - br i1 %7, label %if.8, label %if.7.store - -if.7.store: - store i32 7, i32 addrspace(1)* %out - br label %end - -if.8: - store i32 8, i32 addrspace(1)* %out - br label %end - -end: - ret void -} diff --git a/llvm/test/CodeGen/R600/cf_end.ll b/llvm/test/CodeGen/R600/cf_end.ll deleted file mode 100644 index c74ee22868d..00000000000 --- a/llvm/test/CodeGen/R600/cf_end.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s - -; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] -; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { - ret void -} diff --git a/llvm/test/CodeGen/R600/cgp-addressing-modes.ll b/llvm/test/CodeGen/R600/cgp-addressing-modes.ll deleted file mode 100644 index 77f7bd01b7f..00000000000 --- a/llvm/test/CodeGen/R600/cgp-addressing-modes.ll +++ /dev/null @@ -1,242 +0,0 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; OPT-LABEL: @test_sink_global_small_offset_i32( -; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: -; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( -; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 -; OPT: br i1 - -; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 
s{{[0-9]+$}} -; GCN: {{^}}BB1_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} -; GCN: {{^}}BB2_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: {{^}}BB3_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %in -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: -; GCN: flat_load_dword -; GCN: {{^}}BB4_2: - -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_scratch_small_offset_i32( -; OPT-NOT: getelementptr [512 x i32] -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: {{^}}BB5_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = 
getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( -; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB6_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: {{^}}BB7_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { -entry: - %offset.ext = zext i32 %offset to i64 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/R600/coalescer_remat.ll b/llvm/test/CodeGen/R600/coalescer_remat.ll deleted file mode 100644 index 96730bcf2e8..00000000000 --- a/llvm/test/CodeGen/R600/coalescer_remat.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s - -declare float @llvm.fma.f32(float, float, float) - -; This checks that rematerialization support of the coalescer does not -; unnecessarily widen the register class. Without those fixes > 20 VGprs -; are used here -; Also check that some rematerialization of the 0 constant happened. 
-; CHECK-LABEL: foobar -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 9 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { -entry: - %cmpflag = icmp eq i32 %flag, 1 - br i1 %cmpflag, label %loop, label %exit - -loop: - %c = phi i32 [0, %entry], [%cnext, %loop] - %v0 = phi float [0.0, %entry], [%fma.0, %loop] - %v1 = phi float [0.0, %entry], [%fma.1, %loop] - %v2 = phi float [0.0, %entry], [%fma.2, %loop] - %v3 = phi float [0.0, %entry], [%fma.3, %loop] - - ; Try to get the 0 constant to get coalesced into a wide register - %blup = insertelement <4 x float> undef, float %v0, i32 0 - store <4 x float> %blup, <4 x float> addrspace(1)* %out - - %load = load <4 x float>, <4 x float> addrspace(1)* %in - %load.0 = extractelement <4 x float> %load, i32 0 - %load.1 = extractelement <4 x float> %load, i32 1 - %load.2 = extractelement <4 x float> %load, i32 2 - %load.3 = extractelement <4 x float> %load, i32 3 - %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) - %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) - %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) - %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) - - %cnext = add nsw i32 %c, 1 - %cmp = icmp eq i32 %cnext, 42 - br i1 %cmp, label %exit, label %loop - -exit: - %ev0 = phi float [0.0, %entry], [%fma.0, %loop] - %ev1 = phi float [0.0, %entry], [%fma.1, %loop] - %ev2 = phi float [0.0, %entry], [%fma.2, %loop] - %ev3 = phi float [0.0, %entry], [%fma.3, %loop] - %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 - %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 - %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 - %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 - store <4 x float> %dst.3, <4 x float> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll deleted file mode 100644 index 58517209267..00000000000 --- a/llvm/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s - -; OPT-LABEL: @test( -; OPT: mul nsw i32 -; OPT-NEXT: sext - -; SI-LLC-LABEL: {{^}}test: -; SI-LLC: s_mul_i32 -; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { -entry: - %0 = mul nsw i32 %a, 3 - %1 = sext i32 %0 to i64 - %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 - store i8 %b, i8 addrspace(1)* %2 - ret void -} diff --git a/llvm/test/CodeGen/R600/combine_vloads.ll b/llvm/test/CodeGen/R600/combine_vloads.ll deleted file mode 100644 index 01572afa620..00000000000 --- a/llvm/test/CodeGen/R600/combine_vloads.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; -; kernel void combine_vloads(global char8* src, global char8* result) { -; for (int i = 0; i < 1024; ++i) -; result[i] = src[0] + src[1] + src[2] + src[3]; -; } -; - - -; 128-bit loads instead of many 8-bit -; EG-LABEL: {{^}}combine_vloads: -; EG: VTX_READ_128 -; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture 
%src, <8 x i8> addrspace(1)* nocapture %result) nounwind { -entry: - br label %for.body - -for.exit: ; preds = %for.body - ret void - -for.body: ; preds = %for.body, %entry - %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] - %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* - %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* - %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 - %1 = bitcast <8 x i32> %vecload2 to <32 x i8> - %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 - %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 - %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 - %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 - %2 = bitcast <8 x i8> %tmp17 to <2 x i32> - %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* - store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 - %tmp19 = add nsw i32 %i.01, 1 - %exitcond = icmp eq i32 %tmp19, 1024 - br i1 %exitcond, label %for.exit, label %for.body -} diff --git a/llvm/test/CodeGen/R600/commute-compares.ll b/llvm/test/CodeGen/R600/commute-compares.ll deleted file mode 100644 index 31766047a35..00000000000 --- a/llvm/test/CodeGen/R600/commute-compares.ll +++ /dev/null @@ -1,697 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; -------------------------------------------------------------------------------- -; i32 compares -; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_eq_64_i32: -; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp eq i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ne_64_i32: -; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ne i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Why isn't this being folded as a constant? 
-; GCN-LABEL: {{^}}commute_ne_litk_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
-; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp ne i32 %val, 12345
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ugt_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp ugt i32 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uge_64_i32:
-; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp uge i32 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ult_64_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp ult i32 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ule_63_i32:
-; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp ule i32 %val, 63
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
-
-; GCN-LABEL: {{^}}commute_ule_64_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp ule i32 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
-; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
-define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp sgt i32 %val, -1
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sge_neg2_i32:
-; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp sge i32 %val, -2
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_slt_neg16_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp slt i32 %val, -16
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sle_5_i32:
-; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep.in
-  %cmp = icmp sle i32 %val, 5
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; --------------------------------------------------------------------------------
-; i64 compares
-; --------------------------------------------------------------------------------
-
-; GCN-LABEL: {{^}}commute_eq_64_i64:
-; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp eq i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ne_64_i64:
-; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp ne i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ugt_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp ugt i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uge_64_i64:
-; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp uge i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ult_64_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp ult i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ule_63_i64:
-; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp ule i64 %val, 63
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
-
-; GCN-LABEL: {{^}}commute_ule_64_i64:
-; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp ule i64 %val, 64
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp sgt i64 %val, -1
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sge_neg2_i64:
-; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp sge i64 %val, -2
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_slt_neg16_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp slt i64 %val, -16
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_sle_5_i64:
-; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load i64, i64 addrspace(1)* %gep.in
-  %cmp = icmp sle i64 %val, 5
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; --------------------------------------------------------------------------------
-; f32 compares
-; --------------------------------------------------------------------------------
-
-
-; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
-; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp oeq float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-
-; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
-; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ogt float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_oge_2.0_f32:
-; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp oge float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_olt_2.0_f32:
-; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp olt float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ole_2.0_f32:
-; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ole float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_one_2.0_f32:
-; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp one float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ord_2.0_f32:
-; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ord float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
-; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ueq float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
-; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ugt float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uge_2.0_f32:
-; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp uge float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ult_2.0_f32:
-; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ult float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ule_2.0_f32:
-; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp ule float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_une_2.0_f32:
-; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp une float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uno_2.0_f32:
-; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load float, float addrspace(1)* %gep.in
-  %cmp = fcmp uno float %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; --------------------------------------------------------------------------------
-; f64 compares
-; --------------------------------------------------------------------------------
-
-
-; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
-; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp oeq double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-
-; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
-; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ogt double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_oge_2.0_f64:
-; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp oge double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_olt_2.0_f64:
-; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp olt double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ole_2.0_f64:
-; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ole double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_one_2.0_f64:
-; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp one double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ord_2.0_f64:
-; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ord double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
-; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ueq double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
-; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ugt double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uge_2.0_f64:
-; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp uge double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ult_2.0_f64:
-; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ult double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_ule_2.0_f64:
-; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp ule double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_une_2.0_f64:
-; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp une double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_uno_2.0_f64:
-; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
-  %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  %val = load double, double addrspace(1)* %gep.in
-  %cmp = fcmp uno double %val, 2.0
-  %ext = sext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %gep.out
-  ret void
-}
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/R600/commute_modifiers.ll b/llvm/test/CodeGen/R600/commute_modifiers.ll
deleted file mode 100644
index 7fc36eabb78..00000000000
--- a/llvm/test/CodeGen/R600/commute_modifiers.ll
+++ /dev/null
@@ -1,181 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.r600.read.tidig.x() #1
-declare float @llvm.fabs.f32(float) #1
-declare float @llvm.fma.f32(float, float, float) nounwind readnone
-
-; FUNC-LABEL: @commute_add_imm_fabs_f32
-; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
-  %x.fabs = call float @llvm.fabs.f32(float %x) #1
-  %z = fadd float 2.0, %x.fabs
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
-; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
-  %x.fabs = call float @llvm.fabs.f32(float %x) #1
-  %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
-  %z = fmul float 4.0, %x.fneg.fabs
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_mul_imm_fneg_f32
-; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
-  %x.fneg = fsub float -0.000000e+00, %x
-  %z = fmul float 4.0, %x.fneg
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FIXME: Should use SGPR for literal.
-; FUNC-LABEL: @commute_add_lit_fabs_f32
-; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
-; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
-  %x.fabs = call float @llvm.fabs.f32(float %x) #1
-  %z = fadd float 1024.0, %x.fabs
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_add_fabs_f32
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load float, float addrspace(1)* %gep.0
-  %y = load float, float addrspace(1)* %gep.1
-  %y.fabs = call float @llvm.fabs.f32(float %y) #1
-  %z = fadd float %x, %y.fabs
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_mul_fneg_f32
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load float, float addrspace(1)* %gep.0
-  %y = load float, float addrspace(1)* %gep.1
-  %y.fneg = fsub float -0.000000e+00, %y
-  %z = fmul float %x, %y.fneg
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_mul_fabs_fneg_f32
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load float, float addrspace(1)* %gep.0
-  %y = load float, float addrspace(1)* %gep.1
-  %y.fabs = call float @llvm.fabs.f32(float %y) #1
-  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
-  %z = fmul float %x, %y.fabs.fneg
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; There's no reason to commute this.
-; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load float, float addrspace(1)* %gep.0
-  %y = load float, float addrspace(1)* %gep.1
-  %x.fabs = call float @llvm.fabs.f32(float %x) #1
-  %y.fabs = call float @llvm.fabs.f32(float %y) #1
-  %z = fmul float %x.fabs, %y.fabs
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
-; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
-; SI-NEXT: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load float, float addrspace(1)* %gep.0
-  %y = load float, float addrspace(1)* %gep.1
-  %x.fabs = call float @llvm.fabs.f32(float %x) #1
-  %y.fabs = call float @llvm.fabs.f32(float %y) #1
-  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
-  %z = fmul float %x.fabs, %y.fabs.fneg
-  store float %z, float addrspace(1)* %out
-  ret void
-}
-
-; Make sure we commute the multiply part for the constant in src0 even
-; though we have negate modifier on src2.
-
-; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
-; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
-; SI: buffer_store_dword [[RESULT]]
-define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %r1 = load float, float addrspace(1)* %gep.0
-  %r2 = load float, float addrspace(1)* %gep.1
-
-  %r2.fabs = call float @llvm.fabs.f32(float %r2)
-
-  %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs)
-  store float %r3, float addrspace(1)* %gep.out
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/complex-folding.ll b/llvm/test/CodeGen/R600/complex-folding.ll
deleted file mode 100644
index a5399a71324..00000000000
--- a/llvm/test/CodeGen/R600/complex-folding.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK: {{^}}main:
-; CHECK-NOT: MOV
-define void @main(<4 x float> inreg %reg0) #0 {
-entry:
-  %0 = extractelement <4 x float> %reg0, i32 0
-  %1 = call float @fabs(float %0)
-  %2 = fptoui float %1 to i32
-  %3 = bitcast i32 %2 to float
-  %4 = insertelement <4 x float> undef, float %3, i32 0
-  call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
-  ret void
-}
-
-declare float @fabs(float) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/concat_vectors.ll b/llvm/test/CodeGen/R600/concat_vectors.ll
deleted file mode 100644
index a09ed1f7385..00000000000
--- a/llvm/test/CodeGen/R600/concat_vectors.ll
+++ /dev/null
@@ -1,296 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}test_concat_v1i32:
-; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
-; instructions that access scratch memory. Bit 23, which is the add_tid_enable
-; bit, is only set for scratch access, so we can check for the absence of this
-; value if we want to ensure scratch memory is not being used.
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> - store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> - store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> - store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { - %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> - store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { - %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> - store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { - %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> - store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> - store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> - store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> - store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> - store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: 
{{^}}test_concat_v1i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { - %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> - store <2 x i1> %concat, <2 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { - %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> - store <4 x i1> %concat, <4 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { - %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> - store <8 x i1> %concat, <8 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { - %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> - store <16 x i1> %concat, <16 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { - %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> - store <32 x i1> %concat, <32 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v32i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { - %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> - store <64 x i1> %concat, <64 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { - %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> - store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { - %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> - store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { - %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> - store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { - %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> - store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i16: -; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { - %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> - store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}concat_vector_crash: -; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -bb: - %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 - %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> - %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> - store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 - ret void -} diff --git a/llvm/test/CodeGen/R600/copy-illegal-type.ll b/llvm/test/CodeGen/R600/copy-illegal-type.ll deleted file mode 100644 index 8b397566066..00000000000 --- a/llvm/test/CodeGen/R600/copy-illegal-type.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_copy_v4i8: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; 
SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; After scalarizing v4i8 loads is fixed. -; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi -; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { - %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/copy-to-reg.ll b/llvm/test/CodeGen/R600/copy-to-reg.ll deleted file mode 100644 index fc875f6ef7a..00000000000 --- a/llvm/test/CodeGen/R600/copy-to-reg.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn 
-mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that CopyToReg instructions don't have non-register operands prior -; to being emitted. - -; Make sure this doesn't crash -; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %alloca = alloca [16 x i32] - br label %loop - -loop: - %inc = phi i32 [0, %entry], [%inc.i, %loop] - %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc - store i32 %inc, i32* %ptr - %inc.i = add i32 %inc, 1 - %cnd = icmp uge i32 %inc.i, 16 - br i1 %cnd, label %done, label %loop - -done: - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp0 - store i32 %tmp1, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ctlz_zero_undef.ll b/llvm/test/CodeGen/R600/ctlz_zero_undef.ll deleted file mode 100644 index bd26c302fe5..00000000000 --- a/llvm/test/CodeGen/R600/ctlz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/ctpop.ll b/llvm/test/CodeGen/R600/ctpop.ll deleted file mode 100644 index 0a031c5e24d..00000000000 --- a/llvm/test/CodeGen/R600/ctpop.ll +++ /dev/null @@ -1,300 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone -declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctpop_i32: -; GCN: s_load_dword [[SVAL:s[0-9]+]], -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; XXX - Why 0 in register? 
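; (A plausible answer, assuming the GCN semantics of the instruction rather
; than anything stated in this patch: v_bcnt_u32_b32 computes
; countbits(src0) + src1, so a standalone ctpop has to supply a literal 0 for
; the unused addend. When the ctpop result feeds an add, as in the sketch
;   %ctpop = call i32 @llvm.ctpop.i32(i32 %val)
;   %add = add i32 %ctpop, %k        ; %k is a hypothetical second operand
; the addend can ride in src1 instead, which is what the v_ctpop_add_* tests
; below check for.)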
-; FUNC-LABEL: {{^}}v_ctpop_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: -; GCN: buffer_load_dword [[VAL1:v[0-9]+]], -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %val1 = load i32, i32 addrspace(1)* %in1, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone - %add = add i32 %ctpop0, %ctpop1 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} -; GCN-NEXT: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %add = add i32 %ctpop0, %sval - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { - 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 - %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone - store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 4 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 4, %ctpop - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 99999 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone - %add = add i32 %ctpop, %const - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %const, %ctpop - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}} -; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 - %const = load i32, i32 addrspace(1)* %gep, align 4 - %add = add i32 %const, %ctpop - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: We currently disallow SALU instructions in all branches, -; but there are some cases when they should be allowed. - -; FUNC-LABEL: {{^}}ctpop_i32_in_br: -; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -; EG: BCNT_INT -define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %else - -if: - %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg) - br label %endif - -else: - %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %tmp4 = load i32, i32 addrspace(1)* %tmp3 - br label %endif - -endif: - %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else] - store i32 %tmp5, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ctpop64.ll b/llvm/test/CodeGen/R600/ctpop64.ll deleted file mode 100644 index e1a0ee3ea21..00000000000 --- a/llvm/test/CodeGen/R600/ctpop64.ll +++ /dev/null @@ -1,124 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -declare i64 @llvm.ctpop.i64(i64) nounwind readnone -declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone -declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone -declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone -declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctpop_i64: -; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}},
0x2c -; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 -; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v2i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v4i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: We currently disallow SALU instructions in all branches, -; but there are some cases when they should be allowed.
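; (One likely such case, assuming the usual GCN control-flow model: divergent
; branches are implemented via the exec mask, which scalar ALU instructions
; ignore, so SALU is conservatively kept out of branches altogether. A branch
; on a uniform value, like the icmp on %cond in ctpop_i64_in_br below, could
; safely keep s_bcnt1_i32_b64 inside the branch.)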
- -; FUNC-LABEL: {{^}}ctpop_i64_in_br: -; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} -; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] -; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] -; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} -; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %else - -if: - %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) - br label %endif - -else: - %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 - %tmp4 = load i64, i64 addrspace(1)* %tmp3 - br label %endif - -endif: - %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] - store i64 %tmp5, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/cttz_zero_undef.ll b/llvm/test/CodeGen/R600/cttz_zero_undef.ll deleted file mode 100644 index 56fcb51fe14..00000000000 --- a/llvm/test/CodeGen/R600/cttz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/cvt_f32_ubyte.ll b/llvm/test/CodeGen/R600/cvt_f32_ubyte.ll deleted file mode 100644 index 3399d9da29e..00000000000 --- a/llvm/test/CodeGen/R600/cvt_f32_ubyte.ll +++ /dev/null @@ -1,196 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}load_i8_to_f32: -; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] -; SI: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %cvt = uitofp i8 %load to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 - %cvt = uitofp <2 x i8> %load to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe -; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - %cvt = uitofp <3 x i8> %load to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v4i8_to_v4f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 
v[[LORESULT:[0-9]+]], [[LOADREG]] -; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %cvt = uitofp <4 x i8> %load to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; This should not be adding instructions to shift into the correct -; position in the word for the component. - -; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: -; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] -; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] -; SI-NOT: v_lshlrev_b32 -; SI-NOT: v_or_b32 - -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]] - -; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 - %cvt = uitofp <4 x i8> %load to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; XXX - This should really still be able to use the v_cvt_f32_ubyte0 -; for each component, but computeKnownBits doesn't handle vectors very -; well. - -; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 - -; XXX - replace with this when v4i8 loads aren't scalarized anymore. -; XSI: buffer_load_dword -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %cvt = uitofp <4 x i8> %load to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load - store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; Make sure this doesn't crash.
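; (The crash risk is presumably the odd vector width: a <7 x i8> load has to
; be widened or split during type legalization, a path that has broken
; before, so the test below only demands that code generation completes and
; an s_endpgm is emitted.)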
-; SI-LABEL: {{^}}load_v7i8_to_v7f32: -; SI: s_endpgm -define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 - %cvt = uitofp <7 x i8> %load to <7 x float> - store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v8i8_to_v8f32: -; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 - %cvt = uitofp <8 x i8> %load to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] -; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] -; SI: buffer_store_dword [[CONV]], -define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 2 - %inreg = and i32 %add, 255 - %cvt = uitofp i32 %inreg to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: -define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %inreg = and i32 %load, 65280 - %shr = lshr i32 %inreg, 8 - %cvt = uitofp i32 %shr to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; We don't get these ones because of the zext, but instcombine removes -; them so it shouldn't really matter. 
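; (For context, a sketch of the instcombine canonicalization being assumed
; here: the zext-plus-uitofp pair
;   %ext = zext i8 %load to i32
;   %cvt = uitofp i32 %ext to float
; folds to a conversion on the narrow type,
;   %cvt = uitofp i8 %load to float
; which is the form the v_cvt_f32_ubyte0 pattern above does match.)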
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %ext = zext i8 %load to i32 - %cvt = uitofp i32 %ext to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 - %ext = zext <4 x i8> %load to <4 x i32> - %cvt = uitofp <4 x i32> %ext to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/R600/cvt_flr_i32_f32.ll deleted file mode 100644 index 2dd3a9f2a77..00000000000 --- a/llvm/test/CodeGen/R600/cvt_flr_i32_f32.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NOT: add -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: -; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 1.0 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %floor = call float @llvm.floor.f32(float %x.fabs) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %floor = call float @llvm.floor.f32(float %x.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 
addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: -; SI-NOT: v_cvt_flr_i32_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/R600/cvt_rpi_i32_f32.ll deleted file mode 100644 index 864ac40260b..00000000000 --- a/llvm/test/CodeGen/R600/cvt_rpi_i32_f32.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %fadd = fadd float %x.fabs, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work because it forms fsub 0.5, x -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %fadd = fadd float %x.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work for same reason as above -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| - -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %fadd = fadd float %x.fabs.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: -; SI-NOT: v_cvt_rpi_i32_f32 -; SI: v_add_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32 -; SI: s_endpgm -define void 
@no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/llvm/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll deleted file mode 100644 index fb43ff4fbdd..00000000000 --- a/llvm/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ /dev/null @@ -1,36 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test is for a bug in -; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where -; the wrong type was being passed to -; TargetLowering::getOperationAction() when checking the legality of -; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. - - -; CHECK: {{^}}sint: -; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %sint = load i32, i32 addrspace(1) * %in - %conv = sitofp i32 %sint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} - -;CHECK: {{^}}uint: -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %uint = load i32, i32 addrspace(1) * %in - %conv = uitofp i32 %uint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/debug.ll b/llvm/test/CodeGen/R600/debug.ll deleted file mode 100644 index a2e0e878b74..00000000000 --- a/llvm/test/CodeGen/R600/debug.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; Test for a crash in the custom assembly dump code. 
- -; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/default-fp-mode.ll b/llvm/test/CodeGen/R600/default-fp-mode.ll deleted file mode 100644 index da8e91454b9..00000000000 --- a/llvm/test/CodeGen/R600/default-fp-mode.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_kernel: - -; DEFAULT: FloatMode: 192 -; DEFAULT: IeeeMode: 0 - -; FP64-DENORMAL: FloatMode: 192 -; FP64-DENORMAL: IeeeMode: 0 - -; FP32-DENORMAL: FloatMode: 48 -; FP32-DENORMAL: IeeeMode: 0 - -; BOTH-DENORMAL: FloatMode: 240 -; BOTH-DENORMAL: IeeeMode: 0 - -; NO-DENORMAL: FloatMode: 0 -; NO-DENORMAL: IeeeMode: 0 -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { - store float 0.0, float addrspace(1)* %out0 - store double 0.0, double addrspace(1)* %out1 - ret void -} diff --git a/llvm/test/CodeGen/R600/disconnected-predset-break-bug.ll b/llvm/test/CodeGen/R600/disconnected-predset-break-bug.ll deleted file mode 100644 index cdd2c0cd4f4..00000000000 --- a/llvm/test/CodeGen/R600/disconnected-predset-break-bug.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; PRED_SET* instructions must be tied to any instruction that uses their -; result. This tests that there are no instructions between the PRED_SET* -; and the PREDICATE_BREAK in this loop. 
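; (Assumed failure mode, for context: if another ALU clause were scheduled
; between the predicate definition and the predicated break, the predicate
; could be clobbered before it is consumed, which is why the CHECK-NEXT
; lines below pin JUMP and LOOP_BREAK directly after ALU_PUSH_BEFORE.)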
- -; CHECK: {{^}}loop_ge: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK-NEXT: JUMP -; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { -entry: - %cmp5 = icmp sgt i32 %iterations, 0 - br i1 %cmp5, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] - %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %i.07 = add nsw i32 %i.07.in, -1 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 - store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 - %add = add nsw i32 %ai.06, 1 - %exitcond = icmp eq i32 %add, %iterations - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} diff --git a/llvm/test/CodeGen/R600/dot4-folding.ll b/llvm/test/CodeGen/R600/dot4-folding.ll deleted file mode 100644 index 4df7b63bf98..00000000000 --- a/llvm/test/CodeGen/R600/dot4-folding.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Exactly one constant vector can be folded into dot4, which means exactly -; 4 MOV instructions -; CHECK: {{^}}main: -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV - -define void @main(float addrspace(1)* %out) { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) - %3 = insertelement <4 x float> undef, float %2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll deleted file mode 100644 index e7e13d6178c..00000000000 --- a/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() #1 - -; Function Attrs: nounwind -; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: -; CHECK: BB0_1: -; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] -; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] -; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] - -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 -; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 -; CHECK: s_endpgm -define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { -entry: - %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 - %mul = shl nsw i32 %x.i, 1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] - %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] - %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - tail call void @llvm.AMDGPU.barrier.local() #1 - %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 - %tmp = load float, float addrspace(3)* %arrayidx, align 4 - %add1 = add nsw i32 %offset.02, 1 - %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 - %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 - %add3 = add nsw i32 %offset.02, 32 - %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 - %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 - %add5 = add nsw i32 %offset.02, 33 - %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 - %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 - %add7 = add nsw i32 %offset.02, 64 - %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 - %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 - %add9 = fadd float %tmp, %tmp1 - %add10 = fadd float %add9, %tmp2 - %add11 = fadd float %add10, %tmp3 - %add12 = fadd float %add11, %tmp4 - %add13 = fadd float %sum.03, %add12 - %inc = add nsw i32 %k.01, 1 - %add14 = add nsw i32 %offset.02, 97 - %exitcond = icmp eq i32 %inc, 8 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - %tmp5 = sext i32 %x.i to i64 - %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 - store float %add13, float addrspace(1)* %arrayidx15, align 4 - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/R600/ds_read2.ll b/llvm/test/CodeGen/R600/ds_read2.ll deleted file mode 100644 index 5929898f8bd..00000000000 --- a/llvm/test/CodeGen/R600/ds_read2.ll +++ /dev/null @@ -1,515 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; FIXME: We don't get cases where the address was an SGPR because we -; get a copy to the address register for each one. 
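; (Background for the offset0/offset1 operands checked throughout this file,
; assuming the DS instruction encoding: ds_read2_b32 carries two 8-bit
; offsets counted in dwords, so a pair of loads such as
;   ds_read_b32 v0, v2
;   ds_read_b32 v1, v2 offset:32
; can merge into
;   ds_read2_b32 v[0:1], v2 offset1:8
; whenever both byte offsets are dword multiples no larger than 255 dwords.)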
- -@lds = addrspace(3) global [512 x float] undef, align 4 - @lds.f64 = addrspace(3) global [512 x double] undef, align 8 - -; SI-LABEL: @simple_read2_f32 -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_max_offset -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_too_far -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_x2 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load
float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure there is an instruction between the two sets of reads. -; SI-LABEL: @simple_read2_f32_x2_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: s_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - call void @llvm.AMDGPU.barrier.local() #2 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; For some reason adding something to the base address for the first -; element results in only folding the inner pair. 
- -; SI-LABEL: @simple_read2_f32_x2_nonzero_base -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Be careful of vectors of pointers. We don't know if the 2 pointers -; in the vectors are really the same base, so this is not safe to -; merge. -; Base pointers come from different subregister of same super -; register. We can't safely merge this. - -; SI-LABEL: @read2_ptr_is_subreg_arg_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - %val0 = load float, float addrspace(3)* %gep.0, align 4 - %val1 = load float, float addrspace(3)* %gep.1, align 4 - %add.x = add nsw i32 %x.i, 8 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Apply a constant scalar offset after the pointer vector extract. We -; are rejecting merges that have the same, constant 0 offset, so make -; sure we are really rejecting it because of the different -; subregisters. 
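; (Interpretation, not stated explicitly in the tests: the load/store
; optimizer is assumed to pair only loads whose base registers are identical.
; Extracting the two lanes of a pointer vector yields two different
; subregisters of one superregister, so the bases compare unequal even when
; they might alias, and the merge is conservatively rejected.)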
- -; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - - ; Apply an additional offset after the vector that will be more obviously folded. - %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 - - %val0 = load float, float addrspace(3)* %gep.0, align 4 - %val1 = load float, float addrspace(3)* %gep.1.offset, align 4 - %add.x = add nsw i32 %x.i, 8 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; We should be able to merge in this case, but probably not worth the effort. -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm -define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 - %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 - %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1 - %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8> - %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - %val0 = load float, float addrspace(3)* %gep.0, align 4 - %val1 = load float, float addrspace(3)* %gep.1, align 4 - %add.x = add nsw i32 %x.i, 8 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_volatile_0 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_volatile_1 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 =
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Can't fold since not correctly aligned. -; XXX: This isn't really testing anything useful now. I think CI -; allows unaligned LDS accesses, which would be a problem here. -; SI-LABEL: @unaligned_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 1 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 1 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @misaligned_2_simple_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 2 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 2 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f64 -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} -; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f64(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_max_offset -; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 -; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_too_far -; SI-NOT: ds_read2_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 -; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; The alignment is only 4, not 8. -; SI-LABEL: @misaligned_read2_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @load_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @load_constant_disjoint_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_offsets -;
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 -; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 - %sum.0 = fadd float %tmp16, %tmp17 - %sum.1 = fadd float %sum.0, %tmp18 - %sum.2 = fadd float %sum.1, %tmp19 - %sum.3 = fadd float %sum.2, %tmp20 - %sum.4 = fadd float %sum.3, %tmp21 - %sum.5 = fadd float %sum.4, %tmp22 - %sum.6 = fadd float %sum.5, %tmp23 - %sum.7 = fadd float %sum.6, %tmp24 - %sum.8 = fadd float %sum.7, %tmp25 - store float %sum.8, float addrspace(1)* %C, align 4 - ret void -} - -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { - %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 - store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { - %load = load i64, i64 addrspace(3)* %in, align 4 - store i64 %load, i64 addrspace(1)* %out, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/ds_read2_offset_order.ll b/llvm/test/CodeGen/R600/ds_read2_offset_order.ll deleted file mode 100644 index 9ea9a5a2617..00000000000 --- a/llvm/test/CodeGen/R600/ds_read2_offset_order.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; XFAIL: * - -@lds = addrspace(3) global [512 x float] undef, align 4 - -; SI-LABEL: {{^}}offset_order: - -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 - -define void @offset_order(float addrspace(1)* %out) { -entry: - %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 - %val0 = load float, float addrspace(3)* %ptr0 - - %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 - %val1 = load float, float addrspace(3)* %ptr1 - %add1 = fadd float %val0, %val1 - - %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 - %val2 = load float, float addrspace(3)* %ptr2 - %add2 = fadd float %add1, %val2 - - %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 - %val3 = load float, float addrspace(3)* %ptr3 - %add3 = fadd float %add2, 
%val3 - - %ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 - %val4 = load float, float addrspace(3)* %ptr4 - %add4 = fadd float %add3, %val4 - - %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 - %val5 = load float, float addrspace(3)* %ptr5 - %add5 = fadd float %add4, %val5 - - %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 - %val6 = load float, float addrspace(3)* %ptr6 - %add6 = fadd float %add5, %val6 - store float %add6, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ds_read2st64.ll b/llvm/test/CodeGen/R600/ds_read2st64.ll deleted file mode 100644 index 54b3b45636d..00000000000 --- a/llvm/test/CodeGen/R600/ds_read2st64.ll +++ /dev/null @@ -1,272 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_read2st64_f32_0_1 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_1_2 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_max_offset -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = 
getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_over_max_offset -; SI-NOT: ds_read2st64_b32 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 -; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16384 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_0 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 63 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_1 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 127 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_0_1 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] 
addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_1_2 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; Alignment is only 4. - -; SI-LABEL: @misaligned_read2st64_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 -; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff -; SI-LABEL: @simple_read2st64_f64_max_offset -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep,
align 8 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_over_max_offset -; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8192 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @invalid_read2st64_f64_odd_offset -; SI-NOT: ds_read2st64_b64 -; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8129 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; The stride of 8 elements is 8 * 8 bytes. We need to make sure the -; stride in elements, not bytes, is a multiple of 64. 
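-; Checking the arithmetic: the byte stride here is 8 * 8 = 64 bytes, which is -; divisible by 64, but the element stride of 8 is not a multiple of 64, so -; ds_read2st64_b64 does not apply and a plain ds_read2_b64 with offset1:8 is -; expected instead.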
- -; SI-LABEL: @byte_size_only_divisible_64_read2_f64 -; SI-NOT: ds_read2st64_b64 -; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 -; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/ds_write2.ll b/llvm/test/CodeGen/R600/ds_write2.ll deleted file mode 100644 index b553d3459e4..00000000000 --- a/llvm/test/CodeGen/R600/ds_write2.ll +++ /dev/null @@ -1,425 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_write2_one_val_f32 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32
%x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_0 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_1 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; 2 data subregisters from different super registers. 
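-; In the next test, the first data operand is the low dword of one 64-bit -; load and the second is the high dword of another, so the single -; ds_write2_b32 must take its two data operands from different super -; registers.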
-; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 - %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 - %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 - %val0.0 = extractelement <2 x float> %val0, i32 0 - %val1.1 = extractelement <2 x float> %val1, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0.0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1.1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg2_f32 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 - %val0 = extractelement <2 x float> %val, i32 0 - %val1 = extractelement <2 x float> %val, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i - %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 - %val0 = extractelement <4 x float> %val, i32 0 - %val1 = extractelement <4 x float> %val, i32 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_too_far_f32 -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - - ; Apply an additional offset after the vector that will be more obviously folded. 
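- ; Even with this folded offset, the two stores below must stay separate, - ; since %gep.0 and %gep.1 still come from different subregisters of the - ; %lds.ptr argument.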
- %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 - store float %val0, float addrspace(3)* %gep.0, align 4 - - %add.x = add nsw i32 %x.i, 8 - store float %val1, float addrspace(3)* %gep.1.offset, align 4 - ret void -} - -; SI-LABEL: @simple_write2_one_val_f64 -; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @misaligned_simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @store_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - ret void -} - -; SI-LABEL: @store_constant_disjoint_offsets -; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 -define void @store_misaligned64_constant_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %val = load float, float addrspace(1)* %in - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - store float %val, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - store float %val, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - store float %val, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - store float %val, 
float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - store float %val, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - store float %val, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - store float %val, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - store float %val, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - store float %val, float addrspace(3)* %arrayidx80, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/ds_write2st64.ll b/llvm/test/CodeGen/R600/ds_write2st64.ll deleted file mode 100644 index 1d9d881c5c7..00000000000 --- a/llvm/test/CodeGen/R600/ds_write2st64.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - - -@lds = addrspace(3) global [512 x float] undef, align 4 - - -; SI-LABEL: @simple_write2st64_one_val_f32_0_1 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 -; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_f32_2_5 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 -; SI: s_endpgm 
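-; The element offsets below are 128 = 2 * 64 and 320 = 5 * 64, so in the -; 64-element stride units of write2st64 they encode as offset0:2 offset1:5.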
-define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %add.x.0 = add nsw i32 %x.i, 128 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 320 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], -; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 -; SI-NOT: ds_write2st64_b64 -; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 -; SI: s_endpgm -define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/elf.ll b/llvm/test/CodeGen/R600/elf.ll deleted file mode 100644 index d0fd06a3437..00000000000 --- a/llvm/test/CodeGen/R600/elf.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s - -; Test that we don't try to produce a COFF file on Windows. -; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config -; ELF: Type: SHT_PROGBITS - -; ELF: Symbol { -; ELF: Name: test -; ELF: Binding: Global - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 45096 -; TYPICAL-NEXT: .long 0 -; TONGA-NEXT: .long 576 -; CONFIG: .align 256 -; CONFIG: test: -define void @test(i32 %p) #0 { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/llvm/test/CodeGen/R600/elf.r600.ll b/llvm/test/CodeGen/R600/elf.r600.ll deleted file mode 100644 index 51cd0850093..00000000000 --- a/llvm/test/CodeGen/R600/elf.r600.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck
--check-prefix=ELF %s -; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 166100 -; CONFIG-NEXT: .long 2 -; CONFIG-NEXT: .long 165900 -; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - store float %r, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/empty-function.ll b/llvm/test/CodeGen/R600/empty-function.ll deleted file mode 100644 index a060900811e..00000000000 --- a/llvm/test/CodeGen/R600/empty-function.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure we don't assert on empty functions - -; SI: .text -; SI-LABEL: {{^}}empty_function_ret: -; SI: s_endpgm -; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { - ret void -} - -; SI: .text -; SI-LABEL: {{^}}empty_function_unreachable: -; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { - unreachable -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/R600/endcf-loop-header.ll b/llvm/test/CodeGen/R600/endcf-loop-header.ll deleted file mode 100644 index 267a323c506..00000000000 --- a/llvm/test/CodeGen/R600/endcf-loop-header.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; This tests that the llvm.SI.end.cf intrinsic is not inserted into the -; loop block. This intrinsic will be lowered to s_or_b64 by the code -; generator. - -; CHECK-LABEL: {{^}}test: - -; This is was lowered from the llvm.SI.end.cf intrinsic: -; CHECK: s_or_b64 exec, exec - -; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} -; CHECK-NOT: s_or_b64 exec, exec -; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %loop - -if: - store i32 0, i32 addrspace(1)* %out - br label %loop - -loop: - %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] - %inc = add i32 %tmp1, %cond - %tmp2 = icmp ugt i32 %inc, 10 - br i1 %tmp2, label %done, label %loop - -done: - %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 %inc, i32 addrspace(1)* %tmp3 - ret void -} diff --git a/llvm/test/CodeGen/R600/extload-private.ll b/llvm/test/CodeGen/R600/extload-private.ll deleted file mode 100644 index 294c3a9c678..00000000000 --- a/llvm/test/CodeGen/R600/extload-private.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = sext i8 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca 
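The property extload-private.ll pins down here is that the extension is folded into the scratch load itself rather than emitted as a separate ALU operation: a sign-extending i8 load selects buffer_load_sbyte, a zero-extending one selects buffer_load_ubyte, and the i16 cases select buffer_load_sshort and buffer_load_ushort, all with the offen addressing used for private stack slots.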
i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = zext i8 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = sext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = zext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/extload.ll b/llvm/test/CodeGen/R600/extload.ll deleted file mode 100644 index 662eb7a9716..00000000000 --- a/llvm/test/CodeGen/R600/extload.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}anyext_load_i8: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* - store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_i16: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* - store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i8: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* - store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i16: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* - store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 - ret void -} diff --git 
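The anyext tests above need no extension instruction at all: each one bitcasts the narrow pointer to i32 and loads a full dword, so the high bits are don't-care by construction. The EG checks therefore accept a raw VTX_READ_32 (or LDS_READ_RET for the local-memory variants) whose result register flows straight into the matching store.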
a/llvm/test/CodeGen/R600/extract_vector_elt_i16.ll b/llvm/test/CodeGen/R600/extract_vector_elt_i16.ll deleted file mode 100644 index c7572efc6f5..00000000000 --- a/llvm/test/CodeGen/R600/extract_vector_elt_i16.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { - %p0 = extractelement <2 x i16> %foo, i32 0 - %p1 = extractelement <2 x i16> %foo, i32 1 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} - -; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { - %p0 = extractelement <4 x i16> %foo, i32 0 - %p1 = extractelement <4 x i16> %foo, i32 2 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} diff --git a/llvm/test/CodeGen/R600/fabs.f64.ll b/llvm/test/CodeGen/R600/fabs.f64.ll deleted file mode 100644 index 3c6136c1a7b..00000000000 --- a/llvm/test/CodeGen/R600/fabs.f64.ll +++ /dev/null @@ -1,97 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone - -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tidext = sext i32 %tid to i64 - %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext - %val = load double, double addrspace(1)* %gep, align 8 - %fabs = call double @llvm.fabs.f64(double %val) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f64: -; SI: v_and_b32 -; SI-NOT: v_and_b32 -; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { - %fabs = call double @llvm.fabs.f64(double %in) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) - store <2 x double> %fabs, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { - %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) - store <4 x double> %fabs, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], 
{{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @llvm.fabs.f64(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @fabs(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @llvm.fabs.f64(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @fabs(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fabs.ll b/llvm/test/CodeGen/R600/fabs.ll deleted file mode 100644 index 419a73d0266..00000000000 --- a/llvm/test/CodeGen/R600/fabs.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fabs_fn_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - store <2 x float> %fabs, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -define void 
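The arithmetic behind the fabs.f64 checks: fabs only has to clear the sign bit, and for an f64 that is bit 63, which is bit 31 of the high dword. A single v_and_b32 against 0x7fffffff on the high half therefore suffices, the low half passes through untouched, and the vector cases expect exactly one v_and_b32 per element. In the fold tests the mask vanishes entirely because it is absorbed into the |src| source modifier on v_mul_f64, which is what the SI-NOT lines assert.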
@fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - store <4 x float> %fabs, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @fabs(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @llvm.fabs.f32(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/R600/fadd.ll b/llvm/test/CodeGen/R600/fadd.ll deleted file mode 100644 index 5fac328c598..00000000000 --- a/llvm/test/CodeGen/R600/fadd.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fadd_f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { - %add = fadd float %a, %b - store float %add, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %add = fadd <2 x float> %a, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v8f32: -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; 
SI: v_add_f32 -define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { - %add = fadd <8 x float> %a, %b - store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32 - ret void -} diff --git a/llvm/test/CodeGen/R600/fadd64.ll b/llvm/test/CodeGen/R600/fadd64.ll deleted file mode 100644 index 485c55870c4..00000000000 --- a/llvm/test/CodeGen/R600/fadd64.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fadd_f64: -; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} - -define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fadd double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fceil.ll b/llvm/test/CodeGen/R600/fceil.ll deleted file mode 100644 index f23e8919d73..00000000000 --- a/llvm/test/CodeGen/R600/fceil.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.ceil.f32(float) nounwind readnone -declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f32: -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.ceil.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v3f32: -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { - %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone - store <3 x float> %y, <3 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v4f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? 
*}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fceil64.ll b/llvm/test/CodeGen/R600/fceil64.ll deleted file mode 100644 index e8c34f0141e..00000000000 --- a/llvm/test/CodeGen/R600/fceil64.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.ceil.f64(double) nounwind readnone -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f64: -; CI: v_ceil_f64_e32 -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI-DAG: v_cmp_lt_f64 -; SI-DAG: v_cmp_lg_f64 -; SI: s_and_b64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_add_f64 -; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.ceil.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}fceil_v4f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: 
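Decoding the SI expansion of fceil_f64 above, on the reading that s_bfe_u32 packs its field descriptor as offset in the low bits and width in bits 22:16: 0xb0014 extracts 11 bits starting at bit 20, exactly the f64 exponent field as seen in the high dword, and adding 0xfffffc01 subtracts the bias 1023. The shift/not/and sequence then masks away fraction bits to form trunc(x), the integer compares with cndmasks clamp the tiny- and huge-exponent cases, and the final float compares plus v_add_f64 add 1.0 back in precisely the cases where truncation rounded toward zero instead of up. CI and VI skip all of this via the native v_ceil_f64_e32.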
v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fcmp-cnd.ll b/llvm/test/CodeGen/R600/fcmp-cnd.ll deleted file mode 100644 index 530274f920f..00000000000 --- a/llvm/test/CodeGen/R600/fcmp-cnd.ll +++ /dev/null @@ -1,14 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Not checking arguments 2 and 3 to CNDE, because they may change between -;registers and literal.x depending on what the optimizer does. -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 2, i32 3 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fcmp-cnde-int-args.ll b/llvm/test/CodeGen/R600/fcmp-cnde-int-args.ll deleted file mode 100644 index c402805feb3..00000000000 --- a/llvm/test/CodeGen/R600/fcmp-cnde-int-args.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the -; chance to optimize the fcmp + select instructions to SET* was missed -; due to the fact that the operands to fcmp and select had different types - -; CHECK: SET{{[A-Z]+}}_DX10 - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 -1, i32 0 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fcmp.ll b/llvm/test/CodeGen/R600/fcmp.ll deleted file mode 100644 index 5207ab57bad..00000000000 --- a/llvm/test/CodeGen/R600/fcmp.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}fcmp_sext: -; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 - %1 = load float, float addrspace(1)* %arrayidx1 - %cmp = fcmp oeq float %0, %1 - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; This test checks that a setcc node with f32 operands is lowered to a -; SET*_DX10 instruction. 
Previously we were lowering this to: -; SET* + FP_TO_SINT - -; CHECK: {{^}}fcmp_br: -; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} -; CHECK-NEXT {{[0-9]+(5.0}} - -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - br i1 %0, label %IF, label %ENDIF - -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fcmp64.ll b/llvm/test/CodeGen/R600/fcmp64.ll deleted file mode 100644 index 053ab0ed7aa..00000000000 --- a/llvm/test/CodeGen/R600/fcmp64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}flt_f64: -; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ult double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fle_f64: -; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ule double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fgt_f64: -; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ugt double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fge_f64: -; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp uge double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fne_f64: -; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp une double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}feq_f64: -; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ueq double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fconst64.ll b/llvm/test/CodeGen/R600/fconst64.ll deleted file mode 100644 index 89af37545c9..00000000000 --- 
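A pattern worth spelling out in fcmp64.ll above: every unordered IR predicate is selected as the negation of the complementary ordered compare, which gives the required true-on-NaN behavior for free. Reading off the checks: ult becomes v_cmp_nge, ule becomes v_cmp_ngt, ugt becomes v_cmp_nle, uge becomes v_cmp_nlt, une becomes v_cmp_neq, and ueq becomes v_cmp_nlg. The fconst64 test that follows is a constant-encoding check: 5.0 as an f64 is 0x4014000000000000, hence the two s_mov_b32 of 0x40140000 and 0.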
a/llvm/test/CodeGen/R600/fconst64.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fconst_f64: -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 - -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r1 = load double, double addrspace(1)* %in - %r2 = fadd double %r1, 5.000000e+00 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fcopysign.f32.ll b/llvm/test/CodeGen/R600/fcopysign.f32.ll deleted file mode 100644 index b719d5a3978..00000000000 --- a/llvm/test/CodeGen/R600/fcopysign.f32.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.copysign.f32(float, float) nounwind readnone -declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone - -; Try to identify arg based on higher address. -; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc -; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c -; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { - %result = call float @llvm.copysign.f32(float %mag, float %sign) - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { - %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) - store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { - %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - diff --git a/llvm/test/CodeGen/R600/fcopysign.f64.ll b/llvm/test/CodeGen/R600/fcopysign.f64.ll deleted file mode 100644 index 3d8c5599308..00000000000 --- a/llvm/test/CodeGen/R600/fcopysign.f64.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -declare double 
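Two details in the f32 copysign checks above deserve to be explicit. First, v_bfi_b32 dst, mask, a, b computes (mask & a) | (~mask & b), so with mask = 0x7fffffff the magnitude supplies bits 30:0 and the sign operand supplies bit 31, giving copysign in one instruction per 32-bit word; the R600 BFI_INT checks rest on the same identity. Second, the argument offsets differ only in units: SI SMRD offsets count dwords, so 0xb and 0xc address bytes 44 and 48, while VI encodes the same byte offsets directly as 0x2c and 0x30. The f64 variant that follows applies the BFI to the high dword only and stores the magnitude's low dword through unchanged.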
@llvm.copysign.f64(double, double) nounwind readnone -declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone -declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] -; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] -; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { - %result = call double @llvm.copysign.f64(double %mag, double %sign) - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f64: -; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { - %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f64: -; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { - %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) - store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/fdiv.f64.ll b/llvm/test/CodeGen/R600/fdiv.f64.ll deleted file mode 100644 index 7c022e38c80..00000000000 --- a/llvm/test/CodeGen/R600/fdiv.f64.ll +++ /dev/null @@ -1,96 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s - - -; COMMON-LABEL: {{^}}fdiv_f64: -; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 -; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] - -; Check for div_scale bug workaround on SI -; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] - -; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] - -; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc 
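The "div_scale bug workaround" checks above capture how the SI lowering avoids trusting v_div_scale's condition output: both scale results are compared back against the original operands with v_cmp_eq_i32 and the two conditions are folded into vcc with s_xor_b64, which is the vcc that v_div_fmas_f64 later consumes. On CI and newer the second v_div_scale_f64 is expected to write vcc directly, as the CI-DAG line shows.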
- -; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] -; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] -; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] -; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] -; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] -; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] -; COMMON: buffer_store_dwordx2 [[RESULT]] -; COMMON: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { - %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 - %num = load double, double addrspace(1)* %in - %den = load double, double addrspace(1)* %gep.1 - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { - %den = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { - %num = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 - %num = load <2 x double>, <2 x double> addrspace(1)* %in - %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %num = load <4 x double>, <4 x double> addrspace(1)* %in - %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fdiv.ll b/llvm/test/CodeGen/R600/fdiv.ll deleted file mode 100644 index 7cbf8733639..00000000000 --- a/llvm/test/CodeGen/R600/fdiv.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < 
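The f32 expansion checked below is the plain a * (1/b) rewrite. On R600, RECIP_IEEE runs in the transcendental slot and MUL_IEEE consumes it through the PS operand, which by design refers to the previous instruction group's scalar result; that is why the file's comment insists the two instructions be scheduled into separate instruction groups. The SI checks pin the same shape with v_rcp_f32 feeding v_mul_f32.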
%s | FileCheck -check-prefix=R600 %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; These tests check that fdiv is expanded correctly and also test that the -; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate -; instruction groups. - -; FUNC-LABEL: {{^}}fdiv_f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - - - -; FUNC-LABEL: {{^}}fdiv_v2f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fdiv_v4f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fdiv <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fetch-limits.r600.ll b/llvm/test/CodeGen/R600/fetch-limits.r600.ll deleted file mode 100644 index e7160ef5d72..00000000000 --- a/llvm/test/CodeGen/R600/fetch-limits.r600.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s - -; R600 supports 8 fetches in a clause -; CHECK: {{^}}fetch_limits_r600: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r600() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %3 = load <4 x float>, <4 x float> 
addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - %e = fadd <4 x float> %res8, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - - %bcde = fadd <4 x float> %bc, %de - - call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/R600/fetch-limits.r700+.ll b/llvm/test/CodeGen/R600/fetch-limits.r700+.ll deleted file mode 100644 index acaea2aa794..00000000000 --- a/llvm/test/CodeGen/R600/fetch-limits.r700+.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; r700+ supports 16 fetches in a clause -; CHECK: {{^}}fetch_limits_r700: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r700() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - 
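The counts in these two tests are chosen to be exactly one past the hardware limit: fetch_limits_r600 issues nine texture samples against R600's clause limit of eight, and fetch_limits_r700 issues seventeen against the sixteen allowed on r700 and newer, so each function must be split into the two "Fetch clause" headers the CHECK lines demand.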
%3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) - %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - 
%e = fadd <4 x float> %res8, %res9 - %f = fadd <4 x float> %res10, %res11 - %g = fadd <4 x float> %res12, %res13 - %h = fadd <4 x float> %res14, %res15 - %i = fadd <4 x float> %res16, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - %fg = fadd <4 x float> %f, %g - %hi = fadd <4 x float> %h, %i - - %bcde = fadd <4 x float> %bc, %de - %fghi = fadd <4 x float> %fg, %hi - - %bcdefghi = fadd <4 x float> %bcde, %fghi - call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/R600/ffloor.f64.ll b/llvm/test/CodeGen/R600/ffloor.f64.ll deleted file mode 100644 index 45f8382c392..00000000000 --- a/llvm/test/CodeGen/R600/ffloor.f64.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.floor.f64(double) nounwind readnone -declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}ffloor_f64: -; CI: v_floor_f64_e32 -; SI: v_fract_f64_e32 -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 -; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.floor.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] -; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { - %neg = fsub double 0.0, %x - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| -; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { - %abs = call double @llvm.fabs.f64(double %x) - %neg = fsub double 0.0, %abs - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v2f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone - 
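For the ffloor.f64 checks above: SI has no v_floor_f64, so floor(x) is built as x minus its fractional part. v_fract_f64 produces the fraction, v_min_f64 clamps it just below 1.0 (guarding the case where the fraction would otherwise round up to 1.0), v_cmp_class_f64 with the cndmasks passes infinities and NaNs through unmodified, and the closing v_add_f64 forms x + (-fract(x)); the negated-input variants fold the fsub into source modifiers, hence the -[[INPUT]] and -|[[INPUT]]| operands. CI and VI use the single native v_floor_f64_e32 instead.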
store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: -; FIXME-CI: v_floor_f64_e32 -; FIXME-CI: v_floor_f64_e32 -; FIXME-CI: v_floor_f64_e32 -; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ffloor_v4f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v8f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v16f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ffloor.ll b/llvm/test/CodeGen/R600/ffloor.ll deleted file mode 100644 index 61c46ac2bc0..00000000000 --- a/llvm/test/CodeGen/R600/ffloor.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}floor_f32: -; SI: v_floor_f32_e32 -; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { - %tmp = call float @llvm.floor.f32(float %in) #0 - store float %tmp, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v2f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 - store <2 x float> %tmp, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v4f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 - store <4 x float> %tmp, <4 x float> addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.floor.f32(float) #0 - -; Function Attrs: nounwind readonly -declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 - -; Function Attrs: nounwind 
readonly -declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/flat-address-space.ll b/llvm/test/CodeGen/R600/flat-address-space.ll deleted file mode 100644 index 8ceca078f2d..00000000000 --- a/llvm/test/CodeGen/R600/flat-address-space.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s - -; Disable optimizations in case there are optimizations added that -; specialize away generic pointer accesses. - - -; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - -; These testcases might become useless when there are optimizations to -; remove generic pointers. 
-
-; CHECK-LABEL: {{^}}store_flat_i32:
-; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
-; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
-; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
- %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
- store i32 %x, i32 addrspace(4)* %fptr, align 4
- ret void
-}
-
-; CHECK-LABEL: {{^}}store_flat_i64:
-; CHECK: flat_store_dwordx2
-define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
- %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
- store i64 %x, i64 addrspace(4)* %fptr, align 8
- ret void
-}
-
-; CHECK-LABEL: {{^}}store_flat_v4i32:
-; CHECK: flat_store_dwordx4
-define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
- %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
- store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
- ret void
-}
-
-; CHECK-LABEL: {{^}}store_flat_trunc_i16:
-; CHECK: flat_store_short
-define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
- %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
- %y = trunc i32 %x to i16
- store i16 %y, i16 addrspace(4)* %fptr, align 2
- ret void
-}
-
-; CHECK-LABEL: {{^}}store_flat_trunc_i8:
-; CHECK: flat_store_byte
-define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
- %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
- %y = trunc i32 %x to i8
- store i8 %y, i8 addrspace(4)* %fptr, align 2
- ret void
-}
-
-
-
-; CHECK-LABEL @load_flat_i32:
-; CHECK: flat_load_dword
-define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
- %fload = load i32, i32 addrspace(4)* %fptr, align 4
- store i32 %fload, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; CHECK-LABEL @load_flat_i64:
-; CHECK: flat_load_dwordx2
-define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
- %fload = load i64, i64 addrspace(4)* %fptr, align 4
- store i64 %fload, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; CHECK-LABEL @load_flat_v4i32:
-; CHECK: flat_load_dwordx4
-define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
- %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4
- store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
- ret void
-}
-
-; CHECK-LABEL @sextload_flat_i8:
-; CHECK: flat_load_sbyte
-define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
- %fload = load i8, i8 addrspace(4)* %fptr, align 4
- %ext = sext i8 %fload to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; CHECK-LABEL @zextload_flat_i8:
-; CHECK: flat_load_ubyte
-define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
- %fload = load i8, i8 addrspace(4)* %fptr, align 4
- %ext = zext i8 %fload to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; CHECK-LABEL @sextload_flat_i16:
-; CHECK: flat_load_sshort
-define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
- %fload = load i16, i16 addrspace(4)* %fptr, align 4
- %ext = sext i16 %fload to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; CHECK-LABEL @zextload_flat_i16:
-; CHECK: flat_load_ushort
-define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
- %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
- %fload = load i16, i16 addrspace(4)* %fptr, align 4
- %ext = zext i16 %fload to i32
- store i32 %ext, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-
-
-; TODO: This should not be zero when registers are used for small
-; scratch allocations again.
-
-; Check for prologue initializing special SGPRs pointing to scratch.
-; CHECK-LABEL: {{^}}store_flat_scratch:
-; CHECK: s_movk_i32 flat_scratch_lo, 0
-; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
-; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
-; CHECK: flat_store_dword
-; CHECK: s_barrier
-; CHECK: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
- %alloca = alloca i32, i32 9, align 4
- %x = call i32 @llvm.r600.read.tidig.x() #3
- %pptr = getelementptr i32, i32* %alloca, i32 %x
- %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
- store i32 %x, i32 addrspace(4)* %fptr
- ; Dummy call
- call void @llvm.AMDGPU.barrier.local() #1
- %reload = load i32, i32 addrspace(4)* %fptr, align 4
- store i32 %reload, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local() #1
-declare i32 @llvm.r600.read.tidig.x() #3
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind noduplicate }
-attributes #3 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/floor.ll b/llvm/test/CodeGen/R600/floor.ll
deleted file mode 100644
index c6bfb8567a0..00000000000
--- a/llvm/test/CodeGen/R600/floor.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
-
-; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = call float @floor(float %r0)
- %vec = insertelement <4 x float> undef, float %r1, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare float @floor(float) readonly
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
diff --git a/llvm/test/CodeGen/R600/fma-combine.ll b/llvm/test/CodeGen/R600/fma-combine.ll
deleted file mode 100644
index bd574b87711..00000000000
--- a/llvm/test/CodeGen/R600/fma-combine.ll
+++ /dev/null
@@ -1,368 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare double @llvm.fabs.f64(double) #0
-declare double @llvm.fma.f64(double, double, double) #0
-declare float @llvm.fma.f32(float, float, float) #0
-
-; (fadd (fmul x, y), z) -> (fma x, y, z)
-; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
-
- %mul = fmul double %a, %b
- %fma = fadd double %mul, %c
- store double %fma, double addrspace(1)* %gep.out
- ret void
-}
-
-; (fadd (fmul x, y), z) -> (fma x, y, z)
-; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
-define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
- %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
-
- %mul = fmul double %a, %b
- %fma0 = fadd double %mul, %c
- %fma1 = fadd double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
- ret void
-}
-
-; (fadd x, (fmul y, z)) -> (fma y, z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
-
- %mul = fmul double %a, %b
- %fma = fadd double %c, %mul
- store double %fma, double addrspace(1)* %gep.out
- ret void
-}
-
-; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
-
- %mul = fmul double %a, %b
- %fma = fsub double %mul, %c
- store double %fma, double addrspace(1)* %gep.out
- ret void
-}
-
-; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
-define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
- %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
-
- %mul = fmul double %a, %b
- %fma0 = fsub double %mul, %c
- %fma1 = fsub double %mul, %d
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
- ret void
-}
-
-; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
-
- %mul = fmul double %a, %b
- %fma = fsub double %c, %mul
- store double %fma, double addrspace(1)* %gep.out
- ret void
-}
-
-; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
-define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
- %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
-
- %mul = fmul double %a, %b
- %fma0 = fsub double %c, %mul
- %fma1 = fsub double %d, %mul
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
- ret void
-}
-
-; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
-
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma = fsub double %mul.neg, %c
-
- store double %fma, double addrspace(1)* %gep.out
- ret void
-}
-
-; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
- %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
-
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul.neg, %d
-
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
- ret void
-}
-
-; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
-; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
-; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
-; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
-; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
- %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0
- %b = load double, double addrspace(1)* %gep.1
- %c = load double, double addrspace(1)* %gep.2
- %d = load double, double addrspace(1)* %gep.3
-
- %mul = fmul double %a, %b
- %mul.neg = fsub double -0.0, %mul
- %fma0 = fsub double %mul.neg, %c
- %fma1 = fsub double %mul, %d
-
- store double %fma0, double addrspace(1)* %gep.out.0
- store double %fma1, double addrspace(1)* %gep.out.1
- ret void
-}
-
-; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
-
-; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
-; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
-; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
-; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
-
- %tmp0 = fmul double %u, %v
- %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
- %tmp2 = fsub double %tmp1, %z
-
- store double %tmp2, double addrspace(1)* %gep.out
- ret void
-}
-
-; fold (fsub x, (fma y, z, (fmul u, v)))
-; -> (fma (fneg y), z, (fma (fneg u), v, x))
-
-; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
-; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
-; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
-; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
-; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
- %tid = tail call i32 @llvm.r600.read.tidig.x() #0
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
- %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
- %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
- %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
- %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
-
- %x = load double, double addrspace(1)* %gep.0
- %y = load double, double addrspace(1)* %gep.1
- %z = load double, double addrspace(1)* %gep.2
- %u = load double, double addrspace(1)* %gep.3
- %v = load double, double addrspace(1)* %gep.4
-
- %tmp0 = fmul double %u, %v
- %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
- %tmp2 = fsub double %x, %tmp1
-
- store double %tmp2, double addrspace(1)* %gep.out
- ret void
-}
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/R600/fma.f64.ll b/llvm/test/CodeGen/R600/fma.f64.ll
deleted file mode 100644
index 0a55ef77855..00000000000
--- a/llvm/test/CodeGen/R600/fma.f64.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare double @llvm.fma.f64(double, double, double) nounwind readnone
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
-
-
-; FUNC-LABEL: {{^}}fma_f64:
-; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
- double addrspace(1)* %in2, double addrspace(1)* %in3) {
- %r0 = load double, double addrspace(1)* %in1
- %r1 = load double, double addrspace(1)* %in2
- %r2 = load double, double addrspace(1)* %in3
- %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
- store double %r3, double addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fma_v2f64:
-; SI: v_fma_f64
-; SI: v_fma_f64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
- <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
- %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
- %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
- %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3
- %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
- store <2 x double> %r3, <2 x double> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fma_v4f64:
-; SI: v_fma_f64
-; SI: v_fma_f64
-; SI: v_fma_f64
-; SI: v_fma_f64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
- <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
- %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
- %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
- %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3
- %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
- store <4 x double> %r3, <4 x double> addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/fma.ll b/llvm/test/CodeGen/R600/fma.ll
deleted file mode 100644
index d6024aa0b4c..00000000000
--- a/llvm/test/CodeGen/R600/fma.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.fma.f32(float, float, float) nounwind readnone
-declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; FUNC-LABEL: {{^}}fma_f32:
-; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
-; EG: FMA {{\*? *}}[[RES]]
-define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
- float addrspace(1)* %in2, float addrspace(1)* %in3) {
- %r0 = load float, float addrspace(1)* %in1
- %r1 = load float, float addrspace(1)* %in2
- %r2 = load float, float addrspace(1)* %in3
- %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
- store float %r3, float addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fma_v2f32:
-; SI: v_fma_f32
-; SI: v_fma_f32
-
-; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
-; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
-; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
-define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
- <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
- %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
- %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
- %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3
- %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
- store <2 x float> %r3, <2 x float> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fma_v4f32:
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-; SI: v_fma_f32
-
-; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
-; EG-DAG: FMA {{\*? *}}[[RES]].X
-; EG-DAG: FMA {{\*? *}}[[RES]].Y
-; EG-DAG: FMA {{\*? *}}[[RES]].Z
-; EG-DAG: FMA {{\*? *}}[[RES]].W
-define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
- <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
- %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
- %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
- %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3
- %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
- store <4 x float> %r3, <4 x float> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
-; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
-define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
- %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
- %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-
- %a = load float, float addrspace(1)* %in.a.gep, align 4
- %b = load float, float addrspace(1)* %in.b.gep, align 4
-
- %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
- store float %fma, float addrspace(1)* %out.gep, align 4
- ret void
-}
-
-; FUNC-LABEL: @fma_commute_mul_s_f32
-define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
- %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
- %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
- %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-
- %a = load float, float addrspace(1)* %in.a.gep, align 4
- %c = load float, float addrspace(1)* %in.b.gep, align 4
-
- %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
- store float %fma, float addrspace(1)* %out.gep, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/fmad.ll b/llvm/test/CodeGen/R600/fmad.ll
deleted file mode 100644
index 935e35123f4..00000000000
--- a/llvm/test/CodeGen/R600/fmad.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = extractelement <4 x float> %reg0, i32 1
- %r2 = extractelement <4 x float> %reg0, i32 2
- %r3 = fmul float %r0, %r1
- %r4 = fadd float %r3, %r2
- %vec = insertelement <4 x float> undef, float %r4, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare float @fabs(float ) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/fmax.ll b/llvm/test/CodeGen/R600/fmax.ll
deleted file mode 100644
index d7127f485c7..00000000000
--- a/llvm/test/CodeGen/R600/fmax.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = extractelement <4 x float> %reg0, i32 1
- %r2 = fcmp oge float %r0, %r1
- %r3 = select i1 %r2, float %r0, float %r1
- %vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/fmax3.f64.ll b/llvm/test/CodeGen/R600/fmax3.f64.ll
deleted file mode 100644
index f78c71b2826..00000000000
--- a/llvm/test/CodeGen/R600/fmax3.f64.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare double @llvm.maxnum.f64(double, double) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
-; SI: buffer_store_dwordx2 [[RESULT]],
-; SI: s_endpgm
-define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
- %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
- %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
- %a = load double, double addrspace(1)* %aptr, align 8
- %b = load double, double addrspace(1)* %bptr, align 8
- %c = load double, double addrspace(1)* %cptr, align 8
- %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone
- %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone
- store double %f1, double addrspace(1)* %out, align 8
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/fmax3.ll b/llvm/test/CodeGen/R600/fmax3.ll
deleted file mode 100644
index c3028a6217d..00000000000
--- a/llvm/test/CodeGen/R600/fmax3.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.maxnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
- store float %f1, float addrspace(1)* %out, align 4
- ret void
-}
-
-; Commute operand of second fmax
-; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
- store float %f1, float addrspace(1)* %out, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/fmax_legacy.f64.ll b/llvm/test/CodeGen/R600/fmax_legacy.f64.ll
deleted file mode 100644
index 828243888ac..00000000000
--- a/llvm/test/CodeGen/R600/fmax_legacy.f64.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; Make sure we don't try to form FMAX_LEGACY nodes with f64
-
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; FUNC-LABEL: @test_fmax_legacy_uge_f64
-define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
-
- %cmp = fcmp uge double %a, %b
- %val = select i1 %cmp, double %a, double %b
- store double %val, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_oge_f64
-define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
-
- %cmp = fcmp oge double %a, %b
- %val = select i1 %cmp, double %a, double %b
- store double %val, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_ugt_f64
-define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
-
- %cmp = fcmp ugt double %a, %b
- %val = select i1 %cmp, double %a, double %b
- store double %val, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_ogt_f64
-define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
-
- %a = load double, double addrspace(1)* %gep.0, align 8
- %b = load double, double addrspace(1)* %gep.1, align 8
-
- %cmp = fcmp ogt double %a, %b
- %val = select i1 %cmp, double %a, double %b
- store double %val, double addrspace(1)* %out, align 8
- ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/fmax_legacy.ll b/llvm/test/CodeGen/R600/fmax_legacy.ll
deleted file mode 100644
index 413957d2982..00000000000
--- a/llvm/test/CodeGen/R600/fmax_legacy.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FIXME: Should replace unsafe-fp-math with no signed zeros.
-
-declare i32 @llvm.r600.read.tidig.x() #1
-
-; FUNC-LABEL: @test_fmax_legacy_uge_f32
-; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-
-; EG: MAX
-define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
-
- %cmp = fcmp uge float %a, %b
- %val = select i1 %cmp, float %a, float %b
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_oge_f32
-; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; EG: MAX
-define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
-
- %cmp = fcmp oge float %a, %b
- %val = select i1 %cmp, float %a, float %b
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_ugt_f32
-; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; EG: MAX
-define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
-
- %cmp = fcmp ugt float %a, %b
- %val = select i1 %cmp, float %a, float %b
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_legacy_ogt_f32
-; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; EG: MAX
-define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
-
- %cmp = fcmp ogt float %a, %b
- %val = select i1 %cmp, float %a, float %b
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-
-; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use
-; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI-NOT: v_max_
-; SI: v_cmp_gt_f32
-; SI-NEXT: v_cndmask_b32
-; SI-NOT: v_max_
-
-; EG: MAX
-define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.r600.read.tidig.x() #1
- %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-
- %a = load float, float addrspace(1)* %gep.0, align 4
- %b = load float, float addrspace(1)* %gep.1, align 4
-
- %cmp = fcmp ogt float %a, %b
- %val = select i1 %cmp, float %a, float %b
- store float %val, float addrspace(1)* %out0, align 4
- store i1 %cmp, i1addrspace(1)* %out1
- ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/fmaxnum.f64.ll b/llvm/test/CodeGen/R600/fmaxnum.f64.ll
deleted file mode 100644
index de563cec341..00000000000
--- a/llvm/test/CodeGen/R600/fmaxnum.f64.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare double @llvm.maxnum.f64(double, double) #0
-declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
-declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0
-declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0
-declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
-
-; FUNC-LABEL: @test_fmax_f64
-; SI: v_max_f64
-define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
- %val = call double @llvm.maxnum.f64(double %a, double %b) #0
- store double %val, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v2f64
-; SI: v_max_f64
-; SI: v_max_f64
-define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
- %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
- store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v4f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
- %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
- store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v8f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
- %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
- store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v16f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-; SI: v_max_f64
-define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
- %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
- store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
- ret void
-}
-
-attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/fmaxnum.ll b/llvm/test/CodeGen/R600/fmaxnum.ll
deleted file mode 100644
index 3029bd02e4d..00000000000
--- a/llvm/test/CodeGen/R600/fmaxnum.ll
+++ /dev/null
@@ -1,283 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare float @llvm.maxnum.f32(float, float) #0
-declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
-declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
-declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
-declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
-
-declare double @llvm.maxnum.f64(double, double)
-
-; FUNC-LABEL: @test_fmax_f32
-; SI: v_max_f32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
- %val = call float @llvm.maxnum.f32(float %a, float %b) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v2f32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
- %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
- store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v4f32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
- %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
- store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v8f32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
- %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
- store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
- ret void
-}
-
-; FUNC-LABEL: @test_fmax_v16f32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-; SI: v_max_f32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]]
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W
-; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X
-; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
-; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
-; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
- %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
- store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-; EG: 2143289344(nan)
-define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_val_nan
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_nan_val
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0
-; SI-NOT: v_max_f32_e32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
-; SI: buffer_store_dword [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
- %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @fmax_var_immediate_f32
-; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG-NOT: MAX_DX10
-; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
- %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @fmax_immediate_var_f32
-; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG: MAX_DX10 {{.*}}[[OUT]],
{{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
- %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @fmax_var_literal_f32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
-; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
- %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: @fmax_literal_var_f32
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
-; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
-; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
- %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
- store float %val, float addrspace(1)* %out, align 4
- ret void
-}
-
-attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/R600/fmin.ll b/llvm/test/CodeGen/R600/fmin.ll
deleted file mode 100644
index defa8c09638..00000000000
--- a/llvm/test/CodeGen/R600/fmin.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %r0 = extractelement <4 x float> %reg0, i32 0
- %r1 = extractelement <4 x float> %reg0, i32 1
- %r2 = fcmp uge float %r0, %r1
- %r3 = select i1 %r2, float %r1, float %r0
- %vec = insertelement <4 x float> undef, float %r3, i32 0
- call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
- ret void
-}
-
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/fmin3.ll b/llvm/test/CodeGen/R600/fmin3.ll
deleted file mode 100644
index 0a76699b43e..00000000000
--- a/llvm/test/CodeGen/R600/fmin3.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.minnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
- %a = load float, float addrspace(1)* %aptr, align 4
- %b = load float, float addrspace(1)* %bptr, align 4
- %c = load float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
- store float %f1, float addrspace(1)* %out, align 4
- ret void
-}
-
-; Commute operand of second fmin
-; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI:
buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/fmin_legacy.f64.ll b/llvm/test/CodeGen/R600/fmin_legacy.f64.ll deleted file mode 100644 index e19a48f3f7e..00000000000 --- a/llvm/test/CodeGen/R600/fmin_legacy.f64.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f64 -define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { - %r0 = extractelement <4 x double> %reg0, i32 0 - %r1 = extractelement <4 x double> %reg0, i32 1 - %r2 = fcmp uge double %r0, %r1 - %r3 = select i1 %r2, double %r1, double %r0 - %vec = insertelement <4 x double> undef, double %r3, i32 0 - store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f64 -define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ule double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f64 -define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ole double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f64 -define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp olt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - 
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ult double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fmin_legacy.ll b/llvm/test/CodeGen/R600/fmin_legacy.ll deleted file mode 100644 index 6a625c239d7..00000000000 --- a/llvm/test/CodeGen/R600/fmin_legacy.ll +++ /dev/null @@ -1,123 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FIXME: Should replace unsafe-fp-math with no signed zeros. - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f32 -; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 -; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp uge float %r0, %r1 - %r3 = select i1 %r2, float %r1, float %r0 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ule float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f32 -; SI: buffer_load_dword 
[[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp olt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ult float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-NOT: v_min -; SI: v_cmp_le_f32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val0 = select i1 %cmp, float %a, float %b - store float %val0, float addrspace(1)* %out0, align 4 - store i1 %cmp, i1 addrspace(1)* %out1 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fminnum.f64.ll b/llvm/test/CodeGen/R600/fminnum.f64.ll deleted file mode 100644 index 0f929d6a81f..00000000000 --- a/llvm/test/CodeGen/R600/fminnum.f64.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.minnum.f64(double, double) #0 -declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 -declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 -declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 -declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 - -; FUNC-LABEL: @test_fmin_f64 -; SI: 
v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { - %val = call double @llvm.minnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 - store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 - store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 - store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 - store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fminnum.ll b/llvm/test/CodeGen/R600/fminnum.ll deleted file mode 100644 index 4d7b52540d8..00000000000 --- a/llvm/test/CodeGen/R600/fminnum.ll +++ /dev/null @@ -1,281 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.minnum.f32(float, float) #0 -declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 - -; FUNC-LABEL: @test_fmin_f32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.minnum.f32(float %a, float %b) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> 
%a, <2 x float> %b) #0 - store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 - store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 - store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 - store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 
[[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_val_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_val -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_immediate_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW 
[[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_immediate_var_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fmul.ll b/llvm/test/CodeGen/R600/fmul.ll deleted file mode 100644 index addc409c9eb..00000000000 --- a/llvm/test/CodeGen/R600/fmul.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}fmul_f32: -; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W - -; SI: v_mul_f32 -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fmul float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fmul_v2f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fmul <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> 
addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fmul <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 2.0 - %z = fmul float %y, 3.0 - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k_inv: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI-NOT: v_mad_f32 -; SI: s_endpgm -define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 3.0 - %z = fmul float %y, 2.0 - store float %z, float addrspace(1)* %out - ret void -} - -; There should be three multiplies here; %a should be used twice (once -; negated), not duplicated into mul x, 5.0 and mul x, -5.0. -; FUNC-LABEL: {{^}}test_mul_twouse: -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { - %a = fmul float %x, 5.0 - %b = fsub float -0.0, %a - %c = fmul float %b, %y - %d = fmul float %c, %a - store float %d, float addrspace(1)* %out - ret void -} - -attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/R600/fmul64.ll b/llvm/test/CodeGen/R600/fmul64.ll deleted file mode 100644 index 3c222eaba89..00000000000 --- a/llvm/test/CodeGen/R600/fmul64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -; FUNC-LABEL: {{^}}fmul_f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fmul double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v2f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) { - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 - %r2 = fmul <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, - <4 x double> addrspace(1)* %in2) { - %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 - %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 - %r2 = fmul <4 x double> %r0, %r1 - store <4 x double> %r2, <4 x double> addrspace(1)* %out - ret void -} diff --git 
a/llvm/test/CodeGen/R600/fmuladd.ll b/llvm/test/CodeGen/R600/fmuladd.ll deleted file mode 100644 index ae84d841021..00000000000 --- a/llvm/test/CodeGen/R600/fmuladd.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s - -declare float @llvm.fmuladd.f32(float, float, float) -declare double @llvm.fmuladd.f64(double, double, double) -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; CHECK-LABEL: {{^}}fmuladd_f32: -; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_f64: -; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fadd_a_a_b_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %add.0, %r1 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fadd_b_a_a_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %r1, %add.0 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load 
float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
-
- %r1.fneg = fsub float -0.000000e+00, %r1
-
- %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
- store float %r3, float addrspace(1)* %gep.out
- ret void
-}
-
-
-; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
-; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
-
- %r1.fneg = fsub float -0.000000e+00, %r1
-
- %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
- store float %r3, float addrspace(1)* %gep.out
- ret void
-}
-
-
-; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
-; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
-; CHECK: buffer_store_dword [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
- %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
- %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
- %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
- %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
- %r1 = load float, float addrspace(1)* %gep.0
- %r2 = load float, float addrspace(1)* %gep.1
-
- %r2.fneg = fsub float -0.000000e+00, %r2
-
- %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
- store float %r3, float addrspace(1)* %gep.out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/fnearbyint.ll b/llvm/test/CodeGen/R600/fnearbyint.ll
deleted file mode 100644
index 4fa9adaabda..00000000000
--- a/llvm/test/CodeGen/R600/fnearbyint.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
-
-; This should have exactly the same output as the test for rint,
-; so no need to check anything.
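; For reference, a minimal sketch of the rint form this comment compares
; against, assuming the standard @llvm.rint.f32 intrinsic (scalar f32 case
; only; the vector and f64 cases follow the same shape, and the @frint_f32
; name is illustrative). On SI-class targets both nearbyint and rint are
; expected to select to the round-to-nearest-even instruction (likely
; v_rndne_f32 for f32), which is why this file carries no CHECK lines:
;
;   declare float @llvm.rint.f32(float)
;
;   define void @frint_f32(float addrspace(1)* %out, float %in) {
;   entry:
;     %0 = call float @llvm.rint.f32(float %in)
;     store float %0, float addrspace(1)* %out
;     ret void
;   }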
-
-declare float @llvm.nearbyint.f32(float) #0
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0
-declare double @llvm.nearbyint.f64(double) #0
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
-declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
-
-
-define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
-entry:
- %0 = call float @llvm.nearbyint.f32(float %in)
- store float %0, float addrspace(1)* %out
- ret void
-}
-
-define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
-entry:
- %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
- store <2 x float> %0, <2 x float> addrspace(1)* %out
- ret void
-}
-
-define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
-entry:
- %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
- store <4 x float> %0, <4 x float> addrspace(1)* %out
- ret void
-}
-
-define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
-entry:
- %0 = call double @llvm.nearbyint.f64(double %in)
- store double %0, double addrspace(1)* %out
- ret void
-}
-define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
-entry:
- %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
- store <2 x double> %0, <2 x double> addrspace(1)* %out
- ret void
-}
-
-define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
-entry:
- %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
- store <4 x double> %0, <4 x double> addrspace(1)* %out
- ret void
-}
-
-attributes #0 = { nounwind readonly }
-attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/R600/fneg-fabs.f64.ll b/llvm/test/CodeGen/R600/fneg-fabs.f64.ll
deleted file mode 100644
index 8830e827366..00000000000
--- a/llvm/test/CodeGen/R600/fneg-fabs.f64.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FIXME: Check something here. Currently it seems fabs + fneg aren't
-; folded into 2 modifiers, although theoretically that should work.
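; To make the FIXME concrete: VOP3 encodings on SI carry per-operand abs and
; neg source modifiers, so an fneg-of-fabs pattern such as
;
;   %fabs = call double @llvm.fabs.f64(double %x)
;   %fsub = fsub double -0.000000e+00, %fabs
;
; feeding an fadd could, in principle, select to a single v_add_f64 whose
; second source applies both modifiers at once (the -|v[...]| form the
; checks below look for), instead of first materializing the 0x80000000
; sign mask with a separate v_mov_b32/v_or_b32 pair. This is a sketch of
; the expected folding, not something the current checks verify in full.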
-
-; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
-define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
- %fabs = call double @llvm.fabs.f64(double %x)
- %fsub = fsub double -0.000000e+00, %fabs
- %fadd = fadd double %y, %fsub
- store double %fadd, double addrspace(1)* %out, align 8
- ret void
-}
-
-define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
- %x = load double, double addrspace(1)* %xptr, align 8
- %y = load double, double addrspace(1)* %xptr, align 8
- %fabs = call double @llvm.fabs.f64(double %x)
- %fsub = fsub double -0.000000e+00, %fabs
- %fadd = fadd double %y, %fsub
- store double %fadd, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
-define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
- %fabs = call double @llvm.fabs.f64(double %x)
- %fsub = fsub double -0.000000e+00, %fabs
- %fmul = fmul double %y, %fsub
- store double %fmul, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
-define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
- %bc = bitcast i64 %in to double
- %fabs = call double @llvm.fabs.f64(double %bc)
- %fsub = fsub double -0.000000e+00, %fabs
- store double %fsub, double addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
- %bc = bitcast i64 %in to double
- %fabs = call double @fabs(double %bc)
- %fsub = fsub double -0.000000e+00, %fabs
- store double %fsub, double addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_f64:
-; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
-; SI: s_load_dwordx2
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
-; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
-; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
-define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
- %fabs = call double @llvm.fabs.f64(double %in)
- %fsub = fsub double -0.000000e+00, %fabs
- store double %fsub, double addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
- %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
- %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
- store <2 x double> %fsub, <2 x double> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
-; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
-; SI-NOT: 0x80000000
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
- %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
- %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
-
store <4 x double> %fsub, <4 x double> addrspace(1)* %out - ret void -} - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone diff --git a/llvm/test/CodeGen/R600/fneg-fabs.ll b/llvm/test/CodeGen/R600/fneg-fabs.ll deleted file mode 100644 index 3b4930d9897..00000000000 --- a/llvm/test/CodeGen/R600/fneg-fabs.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: -; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| -define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fadd = fadd float %y, %fsub - store float %fadd, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: -; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| -; SI-NOT: and -define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fmul = fmul float %y, %fsub - store float %fmul, float addrspace(1)* %out, align 4 - ret void -} - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fneg_fabs_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_fneg_fabs_f32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %val = load float, float addrspace(1)* %in, align 4 - %fabs = call float @llvm.fabs.f32(float %val) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV -; R600: 
|{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV - -; FIXME: SGPR should be used directly for first src operand. -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs - store <2 x float> %fsub, <2 x float> addrspace(1)* %out - ret void -} - -; FIXME: SGPR should be used directly for first src operand. -; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs - store <4 x float> %fsub, <4 x float> addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/R600/fneg.f64.ll b/llvm/test/CodeGen/R600/fneg.f64.ll deleted file mode 100644 index aa6df209035..00000000000 --- a/llvm/test/CodeGen/R600/fneg.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f64: -; GCN: v_xor_b32 -define void @fneg_f64(double addrspace(1)* %out, double %in) { - %fneg = fsub double -0.000000e+00, %in - store double %fneg, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f64: -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { - %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in - store <2 x double> %fneg, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f64: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { - %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in - store <4 x double> %fneg, <4 x double> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x8000000000000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f64: -; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}} -define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fsub = fsub double 0.0, %bc - store double %fsub, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fneg_fold_f64: -; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-NOT: xor -; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { - %fsub = fsub double
-0.0, %in - %fmul = fmul double %fsub, %in - store double %fmul, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fneg.ll b/llvm/test/CodeGen/R600/fneg.ll deleted file mode 100644 index a0fd539863c..00000000000 --- a/llvm/test/CodeGen/R600/fneg.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f32: -; R600: -PV - -; GCN: v_xor_b32 -define void @fneg_f32(float addrspace(1)* %out, float %in) { - %fneg = fsub float -0.000000e+00, %in - store float %fneg, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f32: -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { - %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in - store <2 x float> %fneg, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f32: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { - %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in - store <4 x float> %fneg, <4 x float> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f32: -; R600-NOT: XOR -; R600: -KC0[2].Z - -; XXX: We could use v_add_f32_e64 with the negate bit here instead.
-; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} -define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fsub = fsub float 0.0, %bc - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fold_f32: -; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: xor -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { - %fsub = fsub float -0.0, %in - %fmul = fmul float %fsub, %in - store float %fmul, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fp-classify.ll b/llvm/test/CodeGen/R600/fp-classify.ll deleted file mode 100644 index 4fac5176fac..00000000000 --- a/llvm/test/CodeGen/R600/fp-classify.ll +++ /dev/null @@ -1,131 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_isinf_pattern: -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_not_isinf_pattern_0: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_not_isinf_pattern_1: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_isfinite_pattern_0: -; SI-NOT: v_cmp -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Use negative infinity -; SI-LABEL: {{^}}test_isfinite_not_pattern_0: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void 
-} - -; No fabs -; SI-LABEL: {{^}}test_isfinite_not_pattern_1: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %ninf = fcmp une float %x, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; fabs of different value -; SI-LABEL: {{^}}test_isfinite_not_pattern_2: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong ordered compare type -; SI-LABEL: {{^}}test_isfinite_not_pattern_3: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp uno float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong unordered compare -; SI-LABEL: {{^}}test_isfinite_not_pattern_4: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fp16_to_fp.ll b/llvm/test/CodeGen/R600/fp16_to_fp.ll deleted file mode 100644 index 5a79ca82bc2..00000000000 --- a/llvm/test/CodeGen/R600/fp16_to_fp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp16_to_fp32: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; SI-LABEL: {{^}}test_convert_fp16_to_fp64: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] -; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone - store double %cvt, double addrspace(1)* %out, align 4 - ret void -} diff --git 
a/llvm/test/CodeGen/R600/fp32_to_fp16.ll b/llvm/test/CodeGen/R600/fp32_to_fp16.ll deleted file mode 100644 index 67925ebd82b..00000000000 --- a/llvm/test/CodeGen/R600/fp32_to_fp16.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp32_to_fp16: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_short [[RESULT]] -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone - store i16 %cvt, i16 addrspace(1)* %out, align 2 - ret void -} diff --git a/llvm/test/CodeGen/R600/fp_to_sint.f64.ll b/llvm/test/CodeGen/R600/fp_to_sint.f64.ll deleted file mode 100644 index 12df6606e8f..00000000000 --- a/llvm/test/CodeGen/R600/fp_to_sint.f64.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @fp_to_sint_f64_i32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { - %result = fptosi double %in to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { - %result = fptosi <2 x double> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %result = fptosi <4 x double> %in to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptosi double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/fp_to_sint.ll b/llvm/test/CodeGen/R600/fp_to_sint.ll deleted 
file mode 100644 index 301a94b4904..00000000000 --- a/llvm/test/CodeGen/R600/fp_to_sint.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}fp_to_sint_i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { - %conv = fptosi float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: -; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { - %in.fabs = call float @llvm.fabs.f32(float %in) #0 - %conv = fptosi float %in.fabs to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v2i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptosi <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v4i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptosi <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i64: - -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; Check that the compiler doesn't crash with a "cannot select" error -; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { -entry: - %0 = fptosi float %in to i64 - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; 
EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptosi <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptosi <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/fp_to_uint.f64.ll b/llvm/test/CodeGen/R600/fp_to_uint.f64.ll deleted file mode 100644 index 41bc2a78001..00000000000 --- a/llvm/test/CodeGen/R600/fp_to_uint.f64.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}fp_to_uint_i32_f64: -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { - %cast = fptoui double %in to i32 - store i32 %cast, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i32_v2f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i32> - store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i32_v4f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i32> - store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @fp_to_uint_i64_f64 -; CI-DAG: 
buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptoui double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i64> - store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i64> - store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/llvm/test/CodeGen/R600/fp_to_uint.ll b/llvm/test/CodeGen/R600/fp_to_uint.ll deleted file mode 100644 index b7b6ccc238b..00000000000 --- a/llvm/test/CodeGen/R600/fp_to_uint.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { - %conv = fptoui float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptoui <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 - -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptoui <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC: 
{{^}}fp_to_uint_f32_to_i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { - %conv = fptoui float %x to i64 - store i64 %conv, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptoui <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptoui <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fpext.ll b/llvm/test/CodeGen/R600/fpext.ll deleted file mode 100644 index 
734a43be229..00000000000 --- a/llvm/test/CodeGen/R600/fpext.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fpext_f32_to_f64: -; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { - %result = fpext float %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { - %result = fpext <2 x float> %in to <2 x double> - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { - %result = fpext <4 x float> %in to <4 x double> - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { - %result = fpext <8 x float> %in to <8 x double> - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/fptrunc.ll b/llvm/test/CodeGen/R600/fptrunc.ll deleted file mode 100644 index 385e10e7baa..00000000000 --- a/llvm/test/CodeGen/R600/fptrunc.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: -; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { - %result = fptrunc double %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { - %result = fptrunc <2 x double> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { - %result = fptrunc <4 x double> %in to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { - %result = fptrunc <8 x double> %in to <8 x float> - store <8 x float> %result, <8 x float> addrspace(1)* %out 
- ret void -} diff --git a/llvm/test/CodeGen/R600/frem.ll b/llvm/test/CodeGen/R600/frem.ll deleted file mode 100644 index f245ef08cb9..00000000000 --- a/llvm/test/CodeGen/R600/frem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}frem_f32: -; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} -; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN-DAG: v_cmp -; GCN-DAG: v_mul_f32 -; GCN: v_rcp_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_trunc_f32_e32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #0 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f32: -; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} -; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] -; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] -; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] -; GCN: buffer_store_dword [[RESULT]] -; GCN: s_endpgm -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #1 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}frem_f64: -; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN-DAG: v_div_fmas_f64 -; GCN-DAG: v_div_scale_f64 -; GCN-DAG: v_mul_f64 -; CI: v_trunc_f64_e32 -; CI: v_mul_f64 -; GCN: v_add_f64 -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #0 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f64: -; GCN: v_rcp_f64_e32 -; GCN: v_mul_f64 -; SI: v_bfe_u32 -; CI: v_trunc_f64_e32 -; GCN: v_fma_f64 -; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #1 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, - <2 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 - %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 - %r1 = load <2 x float>, <2 x float> addrspace(1)* 
%gep2, align 8 - %r2 = frem <2 x float> %r0, %r1 - store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, - <4 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 - %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 - %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 - %r2 = frem <4 x float> %r0, %r1 - store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 - %r2 = frem <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind "unsafe-fp-math"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/R600/fsqrt.ll b/llvm/test/CodeGen/R600/fsqrt.ll deleted file mode 100644 index 04101346cdf..00000000000 --- a/llvm/test/CodeGen/R600/fsqrt.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s - -; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) - -; CHECK: {{^}}fsqrt_f32: -; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} - -define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %r0 = load float, float addrspace(1)* %in - %r1 = call float @llvm.sqrt.f32(float %r0) - store float %r1, float addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fsqrt_f64: -; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r0 = load double, double addrspace(1)* %in - %r1 = call double @llvm.sqrt.f64(double %r0) - store double %r1, double addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %Val) -declare double @llvm.sqrt.f64(double %Val) diff --git a/llvm/test/CodeGen/R600/fsub.ll b/llvm/test/CodeGen/R600/fsub.ll deleted file mode 100644 index dfe41cb5b11..00000000000 --- a/llvm/test/CodeGen/R600/fsub.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}v_fsub_f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 - %b = load float, float addrspace(1)* %b_ptr, align 4 - %result = fsub float %a, %b - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_fsub_f32: -; R600: 
ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W - -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { - %sub = fsub float %a, %b - store float %sub, float addrspace(1)* %out, align 4 - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fsub_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y - -; FIXME: Should be using SGPR directly for first operand -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %sub = fsub <2 x float> %a, %b - store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_fsub_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} - -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: Should be using SGPR directly for first operand - -; FUNC-LABEL: {{^}}s_fsub_v4f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/fsub64.ll b/llvm/test/CodeGen/R600/fsub64.ll deleted file mode 100644 index f34a48e30a8..00000000000 --- a/llvm/test/CodeGen/R600/fsub64.ll +++ /dev/null @@ -1,107 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.fabs.f64(double) #0 - -; SI-LABEL: {{^}}fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fsub double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double addrspace(1)* 
%out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 - %r2 = fsub double %r0, %r1.fabs - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 - %r2 = fsub double %r0.fabs, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, %b - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double 4.0, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, 4.0 - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_self_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { - %sub = fsub double %a, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v2f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { - %sub = fsub <2 x double> %a, %b - store <2 x double> %sub, <2 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %a = load <4 x double>, <4 x double> addrspace(1)* %in - %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr - %result = fsub <4 x double> %a, %b - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { - %result = fsub <4 x
double> %result, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/ftrunc.f64.ll b/llvm/test/CodeGen/R600/ftrunc.f64.ll deleted file mode 100644 index 6618d8b5e57..00000000000 --- a/llvm/test/CodeGen/R600/ftrunc.f64.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.trunc.f64(double) nounwind readnone -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}v_ftrunc_f64: -; CI: v_trunc_f64 -; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 -; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %x = load double, double addrspace(1)* %in, align 8 - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_f64: -; CI: v_trunc_f64_e32 - -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f64: -; CI: v_trunc_f64_e32 
-; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ftrunc.ll b/llvm/test/CodeGen/R600/ftrunc.ll deleted file mode 100644 index edc08609a8a..00000000000 --- a/llvm/test/CodeGen/R600/ftrunc.ll +++ /dev/null @@ -1,120 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s - -declare float @llvm.trunc.f32(float) nounwind readnone -declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}ftrunc_f32: -; EG: TRUNC -; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.trunc.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f32: -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x float> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { -; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone -; store <3 x float> %y, <3 x float> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC 
-; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/gep-address-space.ll b/llvm/test/CodeGen/R600/gep-address-space.ll deleted file mode 100644 index 471b0f6b13e..00000000000 --- a/llvm/test/CodeGen/R600/gep-address-space.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s - -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space: -; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} -; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. 
-; SI: s_or_b32 -; CI: s_add_i32 -; CHECK: ds_write_b32 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v4: -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> - %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 - %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 - %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - store i32 99, i32 addrspace(3)* %p2 - store i32 99, i32 addrspace(3)* %p3 - ret void -} - -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v2: -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> - %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - ret void -} - diff --git a/llvm/test/CodeGen/R600/global-directive.ll b/llvm/test/CodeGen/R600/global-directive.ll deleted file mode 100644 index be775cf9292..00000000000 --- a/llvm/test/CodeGen/R600/global-directive.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure the GlobalDirective isn't merged with the function name - -; SI: .globl foo -; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/global-extload-i1.ll b/llvm/test/CodeGen/R600/global-extload-i1.ll deleted file mode 100644 index bd9557d730f..00000000000 --- a/llvm/test/CodeGen/R600/global-extload-i1.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen broken - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}zextload_global_v1i1_to_v1i32: -; SI: s_endpgm -define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: 
{{^}}zextload_global_v32i1_to_v32i32: -; XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} -; SI: buffer_store_dwordx2 -define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - 
%load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i64(<64 x 
i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/llvm/test/CodeGen/R600/global-extload-i16.ll b/llvm/test/CodeGen/R600/global-extload-i16.ll deleted file mode 100644 index 103a40dee27..00000000000 --- a/llvm/test/CodeGen/R600/global-extload-i16.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: cypress is broken because the bigger testcases spill and it's not implemented - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: -; SI: buffer_load_ushort -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: -; SI: buffer_load_sshort -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: -; SI: buffer_load_ushort -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: -; SI: buffer_load_sshort -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> 
addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: -; SI: buffer_load_ushort v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i64 - store i64 %ext, i64 
addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i64: -; SI: buffer_load_sshort [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* 
%in - %ext = zext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/global-extload-i32.ll b/llvm/test/CodeGen/R600/global-extload-i32.ll deleted file mode 100644 index 79b83452939..00000000000 --- a/llvm/test/CodeGen/R600/global-extload-i32.ll +++ /dev/null @@ -1,457 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: -; SI: buffer_load_dword v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = zext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = sext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> 
addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i32>, <1 x i32> addrspace(1)* %in - %ext = zext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i32>, <1 x i32> addrspace(1)* %in - %ext = sext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = zext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = sext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = zext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = sext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = zext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = sext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = sext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = 
zext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = sext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; 
SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = zext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/global-extload-i8.ll b/llvm/test/CodeGen/R600/global-extload-i8.ll deleted file mode 100644 index b31d5361d5a..00000000000 --- a/llvm/test/CodeGen/R600/global-extload-i8.ll +++ /dev/null @@ -1,299 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: -; SI: buffer_load_sbyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load 
= load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* 
nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: -; SI: buffer_load_ubyte v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64: 
-; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/llvm/test/CodeGen/R600/global-zero-initializer.ll b/llvm/test/CodeGen/R600/global-zero-initializer.ll deleted file mode 100644 index 45aa8bf4e1d..00000000000 --- a/llvm/test/CodeGen/R600/global-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck 
%s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_init_global_global - -@lds = addrspace(1) global [256 x i32] zeroinitializer - -define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(1)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/global_atomics.ll b/llvm/test/CodeGen/R600/global_atomics.ll deleted file mode 100644 index 847950f6376..00000000000 --- a/llvm/test/CodeGen/R600/global_atomics.ll +++ /dev/null @@ -1,801 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_add_i32_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { 
-entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; 
FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: 
buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr 
= getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - 
store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_offset: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = 
getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; 
SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void 
@atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} diff --git a/llvm/test/CodeGen/R600/gv-const-addrspace-fail.ll b/llvm/test/CodeGen/R600/gv-const-addrspace-fail.ll deleted file mode 100644 index 014b0a5482a..00000000000 --- a/llvm/test/CodeGen/R600/gv-const-addrspace-fail.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 - -; FUNC-LABEL: {{^}}test_i8: -; EG: CF_END -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 { - %arrayidx = 
getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s
- %1 = load i8, i8 addrspace(2)* %arrayidx, align 1
- store i8 %1, i8 addrspace(1)* %out
- ret void
-}
-
-@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
-
-; FUNC-LABEL: {{^}}test_i16:
-; EG: CF_END
-; SI: buffer_store_short
-; SI: s_endpgm
-define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
- %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s
- %1 = load i16, i16 addrspace(2)* %arrayidx, align 2
- store i16 %1, i16 addrspace(1)* %out
- ret void
-}
-
-%struct.bar = type { float, [5 x i8] }
-
-; The illegal i8s aren't handled
-@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
-
-; FUNC-LABEL: {{^}}struct_bar_gv_load:
-define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
- %load = load i8, i8 addrspace(2)* %gep, align 1
- store i8 %load, i8 addrspace(1)* %out, align 1
- ret void
-}
-
-
-; The private load isn't scalarized.
-@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
-                                                                    <4 x i32> <i32 5, i32 6, i32 7, i32 8>,
-                                                                    <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
-                                                                    <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
-
-; FUNC-LABEL: {{^}}array_vector_gv_load:
-define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
- %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16
- store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/gv-const-addrspace.ll b/llvm/test/CodeGen/R600/gv-const-addrspace.ll
deleted file mode 100644
index 3c1fc6c98f7..00000000000
--- a/llvm/test/CodeGen/R600/gv-const-addrspace.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-
-@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
-
-@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
-
-; FUNC-LABEL: {{^}}float:
-; FIXME: We should be using s_load_dword here.
-; SI: buffer_load_dword
-; VI: s_load_dword
-
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
-define void @float(float addrspace(1)* %out, i32 %index) {
-entry:
- %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
- %1 = load float, float addrspace(2)* %0
- store float %1, float addrspace(1)* %out
- ret void
-}
-
-@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
-
-; FUNC-LABEL: {{^}}i32:
-
-; FIXME: We should be using s_load_dword here.
-; SI: buffer_load_dword
-; VI: s_load_dword
-
-; EG-DAG: MOV {{\** *}}T2.X
-; EG-DAG: MOV {{\** *}}T3.X
-; EG-DAG: MOV {{\** *}}T4.X
-; EG-DAG: MOV {{\** *}}T5.X
-; EG-DAG: MOV {{\** *}}T6.X
-; EG: MOVA_INT
-
-define void @i32(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
- %1 = load i32, i32 addrspace(2)* %0
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-
-%struct.foo = type { float, [5 x i32] }
-
-@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
-
-; FUNC-LABEL: {{^}}struct_foo_gv_load:
-; GCN: s_load_dword
-
-define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
- %load = load i32, i32 addrspace(2)* %gep, align 4
- store i32 %load, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
-                                                                <1 x i32> <i32 2>,
-                                                                <1 x i32> <i32 3>,
-                                                                <1 x i32> <i32 4> ]
-
-; FUNC-LABEL: {{^}}array_v1_gv_load:
-; FIXME: We should be using s_load_dword here.
-; SI: buffer_load_dword
-; VI: s_load_dword
-define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
- %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
- %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
- store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
- ret void
-}
-
-define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
-entry:
- %0 = icmp eq i32 0, %a
- br i1 %0, label %if, label %else
-
-if:
- %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
- %2 = load float, float addrspace(2)* %1
- store float %2, float addrspace(1)* %out
- br label %endif
-
-else:
- store float 1.0, float addrspace(1)* %out
- br label %endif
-
-endif:
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/half.ll b/llvm/test/CodeGen/R600/half.ll
deleted file mode 100644
index bf8f11860b5..00000000000
--- a/llvm/test/CodeGen/R600/half.ll
+++ /dev/null
@@ -1,525 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-; half args should be promoted to float
-
-; GCN-LABEL: {{^}}load_f16_arg:
-; GCN: s_load_dword [[ARG:s[0-9]+]]
-; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
-; GCN: buffer_store_short [[CVT]]
-define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
- store half %arg, half addrspace(1)* %out
- ret void
-}
-
-; GCN-LABEL: {{^}}load_v2f16_arg:
-; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
-; GCN: s_endpgm
-define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
- store <2 x half> %arg, <2 x half> addrspace(1)* %out
- ret void
-}
-
-; GCN-LABEL: {{^}}load_v3f16_arg:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN-NOT: buffer_load
-; GCN-DAG: buffer_store_dword
-; GCN-DAG:
buffer_store_short -; GCN-NOT: buffer_store -; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { - store <3 x half> %arg, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { - store <4 x half> %arg, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { - store <8 x half> %arg, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { - %fpext = fpext <2 x half> %in to <2 x float> - store <2 x float> %fpext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to float - store float %ext, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x float> - store <2 x float> %ext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_dwordx2 -; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x float> - store <3 x float> %ext, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x float> - store <4 x float> %ext, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x float> - store <8 x float> %ext, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f64_arg: -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to double - store double %ext, double addrspace(1)* %out - ret void -} -; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x double> - store <2 x double> %ext, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x double> - store <3 x double> %ext, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x 
double> - store <4 x double> %ext, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x double> - store <8 x double> %ext, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_f16: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - store half %val, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v2f16: -; GCN: buffer_load_dword [[TMP:v[0-9]+]] -; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - store <2 x half> %val, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v4f16: -; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - store <4 x half> %val, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v8f16: -; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - store <8 x half> %val, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f32: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to float - store float %cvt, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x float> - store <16 x float> %cvt, <16 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f64: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] -; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] -; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to double - store double %cvt, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x double> - store <2 x double> %cvt, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x double> - store <3 x double> %cvt, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x double> - store <4 x double> %cvt, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x double> - store <8 x double> %cvt, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x double> - store <16 x double> %cvt, <16 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { - %val = load float, float addrspace(1)* %in - %cvt = fptrunc float %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] -; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { - %val = load <2 x float>, <2 x float> addrspace(1)* %in - %cvt = fptrunc <2 x float> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void 
-} - -; FIXME: Shouldn't do 4th conversion -; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { - %val = load <3 x float>, <3 x float> addrspace(1)* %in - %cvt = fptrunc <3 x float> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { - %val = load <4 x float>, <4 x float> addrspace(1)* %in - %cvt = fptrunc <4 x float> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { - %val = load <8 x float>, <8 x float> addrspace(1)* %in - %cvt = fptrunc <8 x float> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { - %val = load <16 x float>, <16 x float> addrspace(1)* %in - %cvt = fptrunc <16 x float> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} - -; FIXME: Unsafe math should fold conversions away -; GCN-LABEL: {{^}}fadd_f16: -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { - %add = fadd half %a, %b - store half %add, half addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}fadd_v2f16: -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { - %add = fadd <2 x half> %a, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}fadd_v4f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 - %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 - %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x half> %a, %b - store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 - ret void -} - -; GCN-LABEL: {{^}}fadd_v8f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { - %add = fadd <8 x half> %a, %b - store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 - ret void -} - -; GCN-LABEL: {{^}}fsub_f16: -; GCN: v_subrev_f32_e32 -; GCN: s_endpgm -define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 - %a = load half, half addrspace(1)* %in - %b = load half, half addrspace(1)* %b_ptr - %sub = fsub half %a, %b - store half %sub, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_from_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { - %val = load half, half addrspace(1)* %in - %val_int = bitcast half %val to i16 - store i16 %val_int, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_to_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { - %val = load i16, i16 addrspace(1)* %in - %val_fp = bitcast i16 %val to half - store half %val_fp, half addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/R600/hsa.ll b/llvm/test/CodeGen/R600/hsa.ll deleted file mode 100644 index f9113399afe..00000000000 --- a/llvm/test/CodeGen/R600/hsa.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s - -; HSA: .section .hsa.version -; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" -; HSA: {{^}}simple: -; Make sure we are setting the ATC bit: -; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 -; HSA: buffer_store_dword 
v{{[0-9]+}}, s[0:[[HI]]], 0 - -define void @simple(i32 addrspace(1)* %out) { -entry: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/i1-copy-implicit-def.ll b/llvm/test/CodeGen/R600/i1-copy-implicit-def.ll deleted file mode 100644 index b11a2113764..00000000000 --- a/llvm/test/CodeGen/R600/i1-copy-implicit-def.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SILowerI1Copies was not handling IMPLICIT_DEF -; SI-LABEL: {{^}}br_implicit_def: -; SI: BB#0: -; SI-NEXT: s_and_saveexec_b64 -; SI-NEXT: s_xor_b64 -; SI-NEXT: BB#1: -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { -bb: - br i1 undef, label %bb1, label %bb2 - -bb1: - store volatile i32 123, i32 addrspace(1)* %out - ret void - -bb2: - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/R600/i1-copy-phi.ll b/llvm/test/CodeGen/R600/i1-copy-phi.ll deleted file mode 100644 index 105cd06b330..00000000000 --- a/llvm/test/CodeGen/R600/i1-copy-phi.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}br_i1_phi: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: v_mov_b32_e32 [[REG]], -1{{$}} -; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: s_endpgm -define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { -bb: - br i1 %arg1, label %bb2, label %bb3 - -bb2: ; preds = %bb - br label %bb3 - -bb3: ; preds = %bb2, %bb - %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] - br i1 %tmp, label %bb4, label %bb6 - -bb4: ; preds = %bb3 - %tmp5 = mul i32 undef, %arg - br label %bb6 - -bb6: ; preds = %bb4, %bb3 - ret void -} diff --git a/llvm/test/CodeGen/R600/i8-to-double-to-float.ll b/llvm/test/CodeGen/R600/i8-to-double-to-float.ll deleted file mode 100644 index c218e1918bb..00000000000 --- a/llvm/test/CodeGen/R600/i8-to-double-to-float.ll +++ /dev/null @@ -1,11 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = uitofp i8 %1 to double - %3 = fptrunc double %2 to float - store float %3, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/llvm/test/CodeGen/R600/icmp-select-sete-reverse-args.ll deleted file mode 100644 index 60e59a5a528..00000000000 --- a/llvm/test/CodeGen/R600/icmp-select-sete-reverse-args.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Test that a select with reversed True/False values is correctly lowered -;to a SETNE_INT. There should only be one SETNE_INT instruction. 
- -;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK-NOT: SETNE_INT - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx1 - %cmp = icmp eq i32 %0, %1 - %value = select i1 %cmp, i32 0, i32 -1 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/icmp64.ll b/llvm/test/CodeGen/R600/icmp64.ll deleted file mode 100644 index 0eaa33ebafe..00000000000 --- a/llvm/test/CodeGen/R600/icmp64.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}test_i64_eq: -; SI: v_cmp_eq_i64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp eq i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ne: -; SI: v_cmp_ne_i64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ne i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_slt: -; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp slt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ult: -; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ult i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sle: -; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sle i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ule: -; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ule i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sgt: -; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sgt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ugt: -; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sge: -; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_uge: -; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp uge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/llvm/test/CodeGen/R600/imm.ll b/llvm/test/CodeGen/R600/imm.ll deleted file mode 100644 
index 12eed550eb1..00000000000 --- a/llvm/test/CodeGen/R600/imm.ll +++ /dev/null @@ -1,617 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s - -; Use a 64-bit value with lo bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_lo: -; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { -entry: - store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 - ret void -} - -; Use a 64-bit value with hi bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_hi: -; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { -entry: - store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { - store i64 -9223372036854775808, i64 addrspace(1) *%out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { - store i32 -2147483648, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { - store float 0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { - store float -0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { - store float 0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { - store float -0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { - store float 1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { - store float -1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: -; 
CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { - store float 2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { - store float -2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { - store float 4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { - store float -4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 -; CHECK: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { - store float 4096.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void 
@add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_literal_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 1024.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36a0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36b0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36e0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffe0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffc0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, 
[[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xfffffffe00000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36ff800000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x3700000000000000 - store float %y, float addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] -; CHECK: 
buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 4.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -4.0 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000001 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000002 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000010 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xffffffffffffffff - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: 
{{^}}add_inline_imm_neg_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffffe - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffff0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x000000000000003F - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000040 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { - store double 0.0, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { - store double -0.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { - store double 0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { - store double -0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 
v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { - store double 1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { - store double -1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { - store double 2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { - store double -2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { - store double 4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { - store double -4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_f64(double addrspace(1)* %out) { - store double 4096.0, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/indirect-addressing-si.ll b/llvm/test/CodeGen/R600/indirect-addressing-si.ll deleted file mode 100644 index f551606d63a..00000000000 --- a/llvm/test/CodeGen/R600/indirect-addressing-si.ll +++ /dev/null @@ -1,121 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; Tests for indirect addressing on SI, which is implemented using dynamic -; indexing of vectors. 
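The lowering these tests pin down, sketched here for reference (register numbers are purely illustrative, not taken from the checks): the dynamic index is first written into the m0 scalar register, and a relative move then selects the vector lane.

    s_mov_b32 m0, s4              ; m0 <- dynamic lane index
    v_movrels_b32_e32 v0, v2      ; relative read:  v0 <- VGPR[2 + m0]
    v_movreld_b32_e32 v2, v0      ; relative write: VGPR[2 + m0] <- v0

The extractelement tests below expect the v_movrels form and the insertelement tests the v_movreld form; the neg_offset tests additionally check that a constant offset such as -512 (0xfffffe00 in two's complement) is folded into the s_add_i32 that forms m0.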
- -; CHECK-LABEL: {{^}}extract_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = extractelement <4 x float> , i32 %0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = extractelement <4 x float> , i32 %in - store float %0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0 -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0 -; CHECK: s_cbranch_execnz -define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = insertelement <4 x float> , float 5.0, i32 %0 - %2 = extractelement <4 x float> %1, i32 2 - store float %2, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = insertelement <4 x float> , float 5.0, i32 %in - %1 = extractelement <4 x float> %0, i32 2 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
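When the index lives in a VGPR, m0 can only hold one value per wavefront, so the *_vgpr tests expect a loop over the distinct lane indices; an abridged sketch, omitting the exec-mask bookkeeping that selects the matching lanes, and again with illustrative register numbers:

    loop:
      v_readfirstlane_b32 s4, v1    ; pull one lane's index into an SGPR
      s_add_i32 m0, s4, -16         ; fold the constant offset into m0
      v_movreld_b32_e32 v0, v5      ; relative write for the matching lanes
      s_cbranch_execnz loop         ; repeat until every lane is handled

This is the v_readfirstlane_b32 / s_add_i32 m0 / s_cbranch_execnz sequence the CHECK lines above and below look for.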
-; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, -{{[0-9]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -16 - %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/indirect-private-64.ll b/llvm/test/CodeGen/R600/indirect-private-64.ll deleted file mode 100644 index d63e1b6c521..00000000000 --- a/llvm/test/CodeGen/R600/indirect-private-64.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s - - -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind - -; SI-LABEL: {{^}}private_access_f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load double, double addrspace(1)* %in, align 8 - %array = alloca double, i32 16, align 8 - %ptr = getelementptr double, double* %array, i32 %b - store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load double, double* %ptr, align 8 - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 - %array = alloca <2 x double>, i32 16, align 16 - %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b - store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x double>, <2 x double>* %ptr, align 16 - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}private_access_i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %array = alloca i64, i32 16, align 8 - %ptr = getelementptr i64, i64* %array, i32 %b - store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate
nounwind - %result = load i64, i64* %ptr, align 8 - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %array = alloca <2 x i64>, i32 16, align 16 - %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b - store <2 x i64> %val, <2 x i64>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x i64>, <2 x i64>* %ptr, align 16 - store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/infinite-loop-evergreen.ll b/llvm/test/CodeGen/R600/infinite-loop-evergreen.ll deleted file mode 100644 index f6e39b3d830..00000000000 --- a/llvm/test/CodeGen/R600/infinite-loop-evergreen.ll +++ /dev/null @@ -1,10 +0,0 @@ -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -define void @inf_loop_irreducible_cfg() nounwind { -entry: - br label %block - -block: - br label %block -} diff --git a/llvm/test/CodeGen/R600/infinite-loop.ll b/llvm/test/CodeGen/R600/infinite-loop.ll deleted file mode 100644 index 7233aa57fd7..00000000000 --- a/llvm/test/CodeGen/R600/infinite-loop.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}infinite_loop: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: BB0_1: -; SI: buffer_store_dword [[REG]] -; SI: s_waitcnt vmcnt(0) expcnt(0) -; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - store i32 999, i32 addrspace(1)* %out, align 4 - br label %for.body -} - diff --git a/llvm/test/CodeGen/R600/inline-asm.ll b/llvm/test/CodeGen/R600/inline-asm.ll deleted file mode 100644 index efc2292de3a..00000000000 --- a/llvm/test/CodeGen/R600/inline-asm.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}inline_asm: -; CHECK: s_endpgm -; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { -entry: - store i32 5, i32 addrspace(1)* %out - call void asm sideeffect "s_endpgm", ""() - ret void -} diff --git a/llvm/test/CodeGen/R600/inline-calls.ll b/llvm/test/CodeGen/R600/inline-calls.ll deleted file mode 100644 index 33a4c832e75..00000000000 --- a/llvm/test/CodeGen/R600/inline-calls.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s - -; CHECK-NOT: {{^}}func: -define internal fastcc i32 @func(i32 %a) { -entry: - %tmp0 = add i32 %a, 1 - ret i32 %tmp0 -} - -; CHECK: {{^}}kernel: -define void @kernel(i32 addrspace(1)* %out) { -entry: - %tmp0 = call i32 @func(i32 1) - store 
i32 %tmp0, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}kernel2: -define void @kernel2(i32 addrspace(1)* %out) { -entry: - call void @kernel(i32 addrspace(1)* %out) - ret void -} diff --git a/llvm/test/CodeGen/R600/input-mods.ll b/llvm/test/CodeGen/R600/input-mods.ll deleted file mode 100644 index 1c4d285cbcb..00000000000 --- a/llvm/test/CodeGen/R600/input-mods.ll +++ /dev/null @@ -1,26 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM - -;EG-LABEL: {{^}}test: -;EG: EXP_IEEE * -;CM-LABEL: {{^}}test: -;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @llvm.fabs.f32(float %r0) - %r2 = fsub float -0.000000e+00, %r1 - %r3 = call float @llvm.exp2.f32(float %r2) - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/insert_subreg.ll b/llvm/test/CodeGen/R600/insert_subreg.ll deleted file mode 100644 index 4a5e8869c2d..00000000000 --- a/llvm/test/CodeGen/R600/insert_subreg.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that INSERT_SUBREG instructions don't have non-register operands after -; instruction selection. - -; Make sure this doesn't crash -; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { -entry: - %tmp0 = alloca [16 x i32] - %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 - %tmp2 = sext i32 %tmp1 to i64 - store i64 %tmp2, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/insert_vector_elt.ll b/llvm/test/CodeGen/R600/insert_vector_elt.ll deleted file mode 100644 index 6de3d408c48..00000000000 --- a/llvm/test/CodeGen/R600/insert_vector_elt.ll +++ /dev/null @@ -1,252 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; FIXME: Broken on evergreen -; FIXME: For some reason the 8 and 16 vectors are being stored as -; individual elements instead of 128-bit stores. - - -; FIXME: Why is the constant moved into the intermediate register and -; not just directly into the vector component? 
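For reference, the code shape that FIXME describes, reconstructed from the dynamic_insertelement CHECK lines further down (register numbers illustrative): the scalar operand is first materialized into a scratch VGPR and only then moved into the dynamically selected lane, rather than the literal being written into the vector component directly.

    v_mov_b32_e32 v4, 0x40a00000    ; 5.0 materialized in a scratch VGPR first
    v_movreld_b32_e32 v0, v4        ; then copied into lane [0 + m0]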
- -; SI-LABEL: {{^}}insertelement_v4f32_0: -; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: -; v_mov_b32_e32 -; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 -; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] -; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { - %vecins = insertelement <4 x i32> %a, i32 999, i32 0 - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { - %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b - store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { - %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b - store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { - %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b - store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i32: -; SI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i32> %a, i32 5, i32 %b - store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i32: -; SI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i32> %a, i32 5, i32 %b - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i32> %a, i32 5, i32 %b - store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i32> %a, i32 5, i32 %b - store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i16: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i16> %a, i16 5, i32 %b - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i16: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i16> %a, i16 5, i32 %b - store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i8: -; FIXMESI: BUFFER_STORE_USHORT -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i8> %a, i8 5, i32 %b - store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i8: -; FIXMESI: buffer_store_dword -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i8> %a, i8 5, i32 %b - store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i8: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i8> %a, i8 5, i32 %b - store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i8: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i8> %a, i8 5, i32 %b - store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 - ret void -} - -; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that -; the compiler doesn't crash. 
-; SI-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { -entry: - %0 = insertelement <2 x i32> undef, i32 %a, i32 0 - %1 = icmp eq i32 %a, 0 - br i1 %1, label %if, label %else - -if: - %2 = load i32, i32 addrspace(1)* %in - %3 = insertelement <2 x i32> %0, i32 %2, i32 1 - br label %endif - -else: - %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %5 = load i32, i32 addrspace(1)* %4 - %6 = insertelement <2 x i32> %0, i32 %5, i32 1 - br label %endif - -endif: - %7 = phi <2 x i32> [%3, %if], [%6, %else] - store <2 x i32> %7, <2 x i32> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { - %vecins = insertelement <2 x double> %a, double 8.0, i32 %b - store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i64> %a, i64 5, i32 %b - store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { - %vecins = insertelement <4 x double> %a, double 8.0, i32 %b - store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { - %vecins = insertelement <8 x double> %a, double 8.0, i32 %b - store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/jump-address.ll b/llvm/test/CodeGen/R600/jump-address.ll deleted file mode 100644 index f55912e3740..00000000000 --- a/llvm/test/CodeGen/R600/jump-address.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: JUMP @6 -; CHECK: EXPORT -; CHECK-NOT: EXPORT - -define void @main() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = 
bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 0xFFF8000000000000, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/kcache-fold.ll b/llvm/test/CodeGen/R600/kcache-fold.ll deleted file mode 100644 index 7e2291cfdc3..00000000000 --- a/llvm/test/CodeGen/R600/kcache-fold.ll +++ /dev/null @@ -1,100 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main1: -; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = extractelement <4 x float> %10, i32 1 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 2 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %27 = extractelement <4 x float> %26, i32 
3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -; CHECK: {{^}}main2: -; CHECK-NOT: MOV -define void @main2() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 1 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %11 = extractelement <4 x float> %10, i32 0 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 3 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 2 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call 
float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDIL.clamp.(float, float, float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/R600/kernel-args.ll b/llvm/test/CodeGen/R600/kernel-args.ll deleted file mode 100644 index 1dd7c2cb799..00000000000 --- a/llvm/test/CodeGen/R600/kernel-args.ll +++ /dev/null @@ -1,473 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i8_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ubyte - -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { -entry: - %0 = sext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ushort - -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { -entry: - %0 = sext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i32_arg: -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { -entry: - store i32 %in, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}f32_arg: -; EG: 
T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { -entry: - store float %in, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { -entry: - store <2 x i8> %in, <2 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN-DAG: buffer_load_ushort -; GCN-DAG: buffer_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { -entry: - store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { -entry: - store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i8_arg: -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { -entry: - store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i16_arg: -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { -entry: - store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 - ret void -} -; FUNC-LABEL: {{^}}v3i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { -entry: - store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { -entry: - store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}v4i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { -entry: - store <4 x i16> %in, <4 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v4i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { -entry: - store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { -entry: - store <8 x i8> %in, <8 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { -entry: - store <8 x i16> %in, <8 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 -define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { -entry: - store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { -entry: - store <8 x float> %in, <8 x float> 
addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v16i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { -entry: - store <16 x i8> %in, <16 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { -entry: - store <16 x i16> %in, <16 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { -entry: - store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v16f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; 
VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { -entry: - store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}kernel_arg_i64: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: buffer_store_dwordx2 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}f64_kernel_arg: -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c -; GCN: buffer_store_dwordx2 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { -entry: - store double %in, double addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}kernel_arg_v1i64: -; XGCN: s_load_dwordx2 -; XGCN: s_load_dwordx2 -; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { -; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } diff --git a/llvm/test/CodeGen/R600/large-alloca.ll b/llvm/test/CodeGen/R600/large-alloca.ll deleted file mode 100644 index 671833d1a33..00000000000 --- a/llvm/test/CodeGen/R600/large-alloca.ll +++ /dev/null @@ -1,15 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s - -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %0 = load i32, i32* %gep1 - store i32 %0, i32 addrspace(1)* %out - ret void -} - diff --git a/llvm/test/CodeGen/R600/large-constant-initializer.ll b/llvm/test/CodeGen/R600/large-constant-initializer.ll deleted file mode 100644 index 9975b1b7f5c..00000000000 --- a/llvm/test/CodeGen/R600/large-constant-initializer.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s -; CHECK: s_endpgm - -@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 - -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 - %mul12 = mul nsw i32 %val, 7 - br i1 undef, label %exit, label %bb - -bb: - %cmp = icmp slt i32 %x, 0 - br label %exit - -exit: - ret void -} - diff --git a/llvm/test/CodeGen/R600/lds-initializer.ll b/llvm/test/CodeGen/R600/lds-initializer.ll deleted file mode 100644 index bf8df63be9f..00000000000 --- a/llvm/test/CodeGen/R600/lds-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_init_lds_global - -@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] - -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git 
a/llvm/test/CodeGen/R600/lds-oqap-crash.ll b/llvm/test/CodeGen/R600/lds-oqap-crash.ll deleted file mode 100644 index 6ff6fc3d7af..00000000000 --- a/llvm/test/CodeGen/R600/lds-oqap-crash.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s - -; The test is for a bug in R600EmitClauseMarkers.cpp where this pass -; was searching for a use of the OQAP register in order to determine -; if an LDS instruction could fit in the current clause, but never finding -; one. This created an infinite loop and hung the compiler. -; -; The LDS instruction should not have been defining OQAP in the first place, -; because the LDS instructions are pseudo instructions and the OQAP -; reads and writes are bundled together in the same instruction. - -; CHECK: {{^}}lds_crash: -define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { -entry: - %0 = load i32, i32 addrspace(3)* %in - ; This block needs to be > 115 ISA instructions to hit the bug, - ; so we'll use udiv instructions. - %div0 = udiv i32 %0, %b - %div1 = udiv i32 %div0, %a - %div2 = udiv i32 %div1, 11 - %div3 = udiv i32 %div2, %a - %div4 = udiv i32 %div3, %b - %div5 = udiv i32 %div4, %c - %div6 = udiv i32 %div5, %div0 - %div7 = udiv i32 %div6, %div1 - store i32 %div7, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/lds-output-queue.ll b/llvm/test/CodeGen/R600/lds-output-queue.ll deleted file mode 100644 index 44ffc36af14..00000000000 --- a/llvm/test/CodeGen/R600/lds-output-queue.ll +++ /dev/null @@ -1,99 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s -; -; This test checks that the lds input queue is empty at the end of -; the ALU clause. - -; CHECK-LABEL: {{^}}lds_input_queue: -; CHECK: LDS_READ_RET * OQAP -; CHECK-NOT: ALU clause -; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP - -@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 - -define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { -entry: - %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index - %1 = load i32, i32 addrspace(3)* %0 - call void @llvm.AMDGPU.barrier.local() - - ; This will start a new clause for the vertex fetch - %2 = load i32, i32 addrspace(1)* %in - %3 = add i32 %1, %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -declare void @llvm.AMDGPU.barrier.local() - -; The machine scheduler does not do proper alias analysis and assumes that -; loads from global values (Note that a global value is different from a -; value from global memory. A global value is a value that is declared -; outside of a function; it can reside in any address space) alias with -; all other loads. -; -; This is a problem for scheduling the reads from the local data share (lds). -; These reads are implemented using two instructions. The first copies the -; data from lds into the lds output queue, and the second moves the data from -; the input queue into main memory. These two instructions don't have to be -; scheduled one after the other, but they do need to be scheduled in the same -; clause. 
The aliasing problem mentioned above causes problems when there is a -; load from global memory which immediately follows a load from a global value that -; has been declared in the local memory space: -; -; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index -; %1 = load i32, i32 addrspace(3)* %0 -; %2 = load i32, i32 addrspace(1)* %in -; -; The instruction selection phase will generate ISA that looks like this: -; %OQAP = LDS_READ_RET -; %vreg0 = MOV %OQAP -; %vreg1 = VTX_READ_32 -; %vreg2 = ADD_INT %vreg1, %vreg0 -; -; The bottom scheduler will schedule the two ALU instructions first: -; -; UNSCHEDULED: -; %OQAP = LDS_READ_RET -; %vreg1 = VTX_READ_32 -; -; SCHEDULED: -; -; %vreg0 = MOV %OQAP -; %vreg2 = ADD_INT %vreg1, %vreg0 -; -; The lack of proper aliasing causes the local memory read (LDS_READ_RET) -; to treat the global memory read (VTX_READ_32) as a chain dependency, so -; the global memory read will always be scheduled first. This will give us a -; final program which looks like this: -; -; Alu clause: -; %OQAP = LDS_READ_RET -; VTX clause: -; %vreg1 = VTX_READ_32 -; Alu clause: -; %vreg0 = MOV %OQAP -; %vreg2 = ADD_INT %vreg1, %vreg0 -; -; This is an illegal program because the OQAP def and use now occur in -; different ALU clauses. -; -; This test checks this scenario and makes sure it doesn't result in an -; illegal program. For now, we have fixed this issue by merging the -; LDS_READ_RET and MOV together during instruction selection and then -; expanding them after scheduling. Once the scheduler has better alias -; analysis, we should be able to keep these instructions separate before -; scheduling. -; -; CHECK-LABEL: {{^}}local_global_alias: -; CHECK: LDS_READ_RET -; CHECK-NOT: ALU clause -; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP -define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 - %1 = load i32, i32 addrspace(3)* %0 - %2 = load i32, i32 addrspace(1)* %in - %3 = add i32 %2, %1 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/lds-size.ll b/llvm/test/CodeGen/R600/lds-size.ll deleted file mode 100644 index 3e8328659fd..00000000000 --- a/llvm/test/CodeGen/R600/lds-size.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test makes sure we do not double count global values when they are -; used in different basic blocks. 
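A minimal sketch of the accounting rule under test, assuming the obvious model (LDS space is counted once per global, never once per referencing basic block); the @hyp_lds0, @hyp_lds1, and @hyp_lds_size names below are hypothetical and do not appear in the deleted file:

@hyp_lds0 = internal unnamed_addr addrspace(3) global i32 undef, align 4
@hyp_lds1 = internal unnamed_addr addrspace(3) global i32 undef, align 4

; The LDS footprint here should be two i32s because there are two globals,
; not because @hyp_lds0 is stored to from two different basic blocks.
define void @hyp_lds_size(i32 %cond) {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if, label %else

if:
  store i32 1, i32 addrspace(3)* @hyp_lds0
  br label %endif

else:
  store i32 2, i32 addrspace(3)* @hyp_lds0
  store i32 3, i32 addrspace(3)* @hyp_lds1
  br label %endif

endif:
  ret void
}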
- -; CHECK: .long 166120 -; CHECK-NEXT: .long 1 -; CHECK-LABEL: {{^}}test: -@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 - -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %if, label %else - -if: - store i32 1, i32 addrspace(3)* @lds - br label %endif - -else: - store i32 2, i32 addrspace(3)* @lds - br label %endif - -endif: - ret void -} diff --git a/llvm/test/CodeGen/R600/lds-zero-initializer.ll b/llvm/test/CodeGen/R600/lds-zero-initializer.ll deleted file mode 100644 index fb51bc0e50c..00000000000 --- a/llvm/test/CodeGen/R600/lds-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global - -@lds = addrspace(3) global [256 x i32] zeroinitializer - -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/llvm/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll deleted file mode 100644 index 4244c48d240..00000000000 --- a/llvm/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests a bug where LegalizeDAG was not checking the target's -; BooleanContents value and always using one for true, when expanding -; setcc to select_cc. -; -; This bug caused the icmp IR instruction to be expanded to two machine -; instructions, when only one is needed. -; - -; CHECK: {{^}}setcc_expand: -; CHECK: SET -; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp eq i32 %in, 5 - br i1 %0, label %IF, label %ENDIF -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/lit.local.cfg b/llvm/test/CodeGen/R600/lit.local.cfg deleted file mode 100644 index ad9ce2541ef..00000000000 --- a/llvm/test/CodeGen/R600/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'R600' in config.root.targets: - config.unsupported = True diff --git a/llvm/test/CodeGen/R600/literals.ll b/llvm/test/CodeGen/R600/literals.ll deleted file mode 100644 index cff1c24f89d..00000000000 --- a/llvm/test/CodeGen/R600/literals.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Test using an integer literal constant. -; Generated ASM should be: -; ADD_INT KC0[2].Z literal.x, 5 -; or -; ADD_INT literal.x KC0[2].Z, 5 - -; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 5, %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test using a float literal constant. 
-; Generated ASM should be: -; ADD KC0[2].Z literal.x, 5.0 -; or -; ADD literal.x KC0[2].Z, 5.0 - -; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { -entry: - %0 = fadd float 5.0, %in - store float %0, float addrspace(1)* %out - ret void -} - -; Make sure inline literals are folded into REG_SEQUENCE instructions. -; CHECK: {{^}}inline_literal_reg_sequence: -; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 - -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK: {{^}}inline_literal_dot4: -; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 -; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { -entry: - %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.abs.ll deleted file mode 100644 index 8bf094b8bc7..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.abs.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone - -; Legacy name -declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_abs_i32: -; SI: s_sub_i32 -; SI: s_max_i32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { - %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_abs_i32: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll deleted file mode 100644 index db883972d64..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | 
FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_global: -; EG: GROUP_BARRIER -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_global(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.global() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.global() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll deleted file mode 100644 index 48fb2e0b1a8..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_local: -; EG: GROUP_BARRIER - -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_local(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.local() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll deleted file mode 100644 index 1168713ca66..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll +++ /dev/null @@ -1,437 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: -; SI: v_bfe_i32 -; EG: BFE_INT 
-define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_bfe_print_arg: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { - %load = load i32, i32 addrspace(1)* %src0, align 4 - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_7: -; SI-NOT: shl -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_8: -; SI: buffer_load_dword -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: 
{{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_13: -; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, 
align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define 
void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 -; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) - %shl = shl i32 %bfe, 8 - %ashr = ashr i32 %shl, 8 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @simplify_demanded_bfe_sdiv -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 -; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] -; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 
1, [[TMP1]] -; SI: buffer_store_dword [[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone - %div = sdiv i32 %bfe, 2 - store i32 %div, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll deleted file mode 100644 index 541119242a9..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll +++ /dev/null @@ -1,627 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: -; SI: buffer_load_ubyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %ext = zext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; SI: buffer_load_dword -; SI: 
v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_1: -; SI: buffer_load_dword -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI: s_endpgm -; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret 
void -} - -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_4: -; SI-NOT: lshl -; SI-NOT: shr -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = lshr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_5: -; SI: buffer_load_dword -; SI-NOT: lshl -; SI-NOT: shr -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_7: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm 
-define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_13: -; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; 
FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 
addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure that SimplifyDemandedBits doesn't cause the and to be -; reduced to the bits demanded by the bfe. - -; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
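The s_bfe_u32 immediates checked in the lshr_and, and_lshr, and shl_lshr tests further below pack the whole bitfield descriptor into one scalar operand. A minimal C sketch of that packing, assuming the SI encoding with the field offset in the low bits and the field width in bits [22:16]; s_bfe_imm is an illustrative helper introduced here, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Hypothetical helper: packs an (offset, width) descriptor the way the
   s_bfe_u32 immediates in the tests below appear to encode it:
   width in bits [22:16], offset in the low bits. */
static uint32_t s_bfe_imm(uint32_t offset, uint32_t width) {
  return (width << 16) | offset;
}

int main(void) {
  /* lshr_and / and_lshr: (a >> 6) & 7 extracts 3 bits at offset 6. */
  assert(s_bfe_imm(6, 3) == 0x30006);
  /* shl_lshr: (a << 9) >> 11 extracts 32 - 11 = 21 bits at offset 2. */
  assert(s_bfe_imm(2, 21) == 0x150002);
  return 0;
}

Under that reading, both immediates in the checks fall out of the shift amounts alone, which is why the shift-and-mask and shift-pair patterns fold to a single s_bfe_u32.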
-; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: -; SI: buffer_load_dword [[ARG:v[0-9]+]] -; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] -; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 -; SI-DAG: buffer_store_dword [[AND]] -; SI-DAG: buffer_store_dword [[BFE]] -; SI: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, - i32 addrspace(1)* %out1, - i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %and = and i32 %src, 63 - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 - store i32 %and, i32 addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lshr_and: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = lshr i32 %a, 6 - %c = and i32 %b, 7 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_lshr_and: -; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 -; SI: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = lshr i32 %a, %b - %d = and i32 %c, 7 - store i32 %d, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 448 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr2: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 511 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}shl_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 -; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = shl i32 %a, 9 - %c = lshr i32 %b, 11 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfi.ll deleted file mode 100644 index 517a55abc09..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfi.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfi_arg_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_arg_imm: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_imm_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - 
%bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_imm_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.bfm.ll deleted file mode 100644 index 50492289d74..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.bfm.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfm_arg_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_arg_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b -; EG: BFM_INT -define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 -; EG: BFM_INT -define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - %c = shl i32 %b, %y - store i32 %c, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern_simple: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - store i32 %b, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: 
buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.clamp.ll deleted file mode 100644 index 11ec963ab31..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.clamp.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) nounwind readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone -declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}clamp_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm - -; EG: MOV_SAT -define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fneg = fsub float -0.0, %src - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %src.fneg.fabs = fsub float -0.0, %src.fabs - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, 
float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll deleted file mode 100644 index 805a88b59c7..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.class.ll +++ /dev/null @@ -1,497 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fneg = fsub float -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 
addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %a.fneg.fabs = fsub float -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 10 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_9bit_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f32: -; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; 
SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; FIXME: Why isn't this using a literal constant operand? -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fneg = fsub double -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: 
{{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %a.fneg.fabs = fsub double -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} -; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} -; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 9 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f64: -; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load double, double addrspace(1)* %in - - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: -; XSI: v_cmp_class_f64_e32 vcc, 1.0, -; SI: v_cmp_class_f64_e32 vcc, -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, 
i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: -; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or3_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - - %sext = sext i1 %or.1 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 - %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 - %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 - %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 - %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, 
%class2 - %or.2 = or i1 %or.1, %class3 - %or.3 = or i1 %or.2, %class4 - %or.4 = or i1 %or.3, %class5 - %or.5 = or i1 %or.4, %class6 - %or.6 = or i1 %or.5, %class7 - %or.7 = or i1 %or.6, %class8 - %or.8 = or i1 %or.7, %class9 - %sext = sext i1 %or.8 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_1: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_2: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} -; SI: s_or_b64 -; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f64: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git 
a/llvm/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.cube.ll deleted file mode 100644 index e95a51093cb..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.cube.ll +++ /dev/null @@ -1,59 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}cube: -; CHECK: CUBE T{{[0-9]}}.X -; CHECK: CUBE T{{[0-9]}}.Y -; CHECK: CUBE T{{[0-9]}}.Z -; CHECK: CUBE * T{{[0-9]}}.W -define void @cube() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %1 = extractelement <4 x float> %0, i32 3 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %3 = extractelement <4 x float> %2, i32 0 - %4 = fdiv float %3, %1 - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %6 = extractelement <4 x float> %5, i32 1 - %7 = fdiv float %6, %1 - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %9 = extractelement <4 x float> %8, i32 2 - %10 = fdiv float %9, %1 - %11 = insertelement <4 x float> undef, float %4, i32 0 - %12 = insertelement <4 x float> %11, float %7, i32 1 - %13 = insertelement <4 x float> %12, float %10, i32 2 - %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 - %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) - %16 = extractelement <4 x float> %15, i32 0 - %17 = extractelement <4 x float> %15, i32 1 - %18 = extractelement <4 x float> %15, i32 2 - %19 = extractelement <4 x float> %15, i32 3 - %20 = call float @fabs(float %18) - %21 = fdiv float 1.000000e+00, %20 - %22 = fmul float %16, %21 - %23 = fadd float %22, 1.500000e+00 - %24 = fmul float %17, %21 - %25 = fadd float %24, 1.500000e+00 - %26 = insertelement <4 x float> undef, float %25, i32 0 - %27 = insertelement <4 x float> %26, float %23, i32 1 - %28 = insertelement <4 x float> %27, float %19, i32 2 - %29 = insertelement <4 x float> %28, float %25, i32 3 - %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) - call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } - diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll deleted file mode 100644 index 8b32f696449..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone - -; SI-LABEL: {{^}}test_unpack_byte0_to_float: -; SI: v_cvt_f32_ubyte0 -define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 
addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte1_to_float: -; SI: v_cvt_f32_ubyte1 -define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte2_to_float: -; SI: v_cvt_f32_ubyte2 -define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte3_to_float: -; SI: v_cvt_f32_ubyte3 -define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll deleted file mode 100644 index 55ca9c7536e..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone -declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fixup_f64: -; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { - %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll deleted file mode 100644 index bcb7f870f1f..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll +++ /dev/null @@ -1,179 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; FIXME: Enable for VI. - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate -declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone -declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone - store float 
%result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f64: -; GCN: v_div_fmas_f64 -define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { - %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { - %cmp = icmp eq i32 %i, 0 - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] -; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] -; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - %cmp1 = icmp ne i32 %d, 0 - %and = and i1 %cmp0, %cmp1 - - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: -; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] - -; SI: buffer_load_dword 
[[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc - - -; SI: BB9_2: -; SI: s_or_b64 exec, exec, [[SAVE]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - br i1 %cmp0, label %bb, label %exit - -bb: - %val = load i32, i32 addrspace(1)* %dummy - %cmp1 = icmp ne i32 %val, 0 - br label %exit - -exit: - %cond = phi i1 [false, %entry], [%cmp1, %bb] - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll deleted file mode 100644 index de830de039c..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll +++ /dev/null @@ -1,364 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone -declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; SI-LABEL @test_div_scale_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - 
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_1: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL 
@test_div_scale_f32_scalar_num_2: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_1: -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_2: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void 
@test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] -; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] -; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 
2.0, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll deleted file mode 100644 index 20c7af8ade5..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_flbit: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_flbit: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm 
-define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll deleted file mode 100644 index e098dd35d6d..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f64: -; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %neg = fsub double 0.0, %val - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg_abs: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; 
SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %abs = call double @llvm.fabs.f64(double %val) - %neg = fsub double 0.0, %abs - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.ll deleted file mode 100644 index 7501b4b7546..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.fract.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float %Val) -declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone - -; Legacy name -declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f32: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %neg = fsub float 0.0, %val - %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg_abs: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg_abs(float 
addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %abs = call float @llvm.fabs.f32(float %val) - %neg = fsub float 0.0, %abs - %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.imad24.ll deleted file mode 100644 index 42102e30f07..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.imad24.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FIXME: Store of i32 seems to be broken pre-EG somehow? - -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imad24: -; SI: v_mad_i32_i24 -; CM: MULADD_INT24 -; R600: MULLO_INT -; R600: ADD_INT -define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.imax.ll deleted file mode 100644 index 46662f96c29..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.imax.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imax: -; SI: v_max_i32_e32 -define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imax: -; SI: s_max_i32 -define void @scalar_imax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.imin.ll deleted file mode 100644 index 34b454e2375..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.imin.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga 
-verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imin: -; SI: v_min_i32_e32 -define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imin: -; SI: s_min_i32 -define void @scalar_imin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.imul24.ll deleted file mode 100644 index fdc1172260b..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.imul24.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imul24: -; SI: v_mul_i32_i24 -; CM: MUL_INT24 -; R600: MULLO_INT -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.kill.ll deleted file mode 100644 index 057708e7b5c..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.kill.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kill_gs_const: -; SI-NOT: v_cmpx_le_f32 -; SI: s_mov_b64 exec, 0 - -define void @kill_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) - ret void -} - -; SI-LABEL: {{^}}kill_vcc_implicit_def: -; SI-NOT: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { -entry: 
- %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) - %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) - ret void -} - -declare void @llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="2" } -attributes #1 = { "ShaderType"="0" } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll deleted file mode 100644 index a59c0ce6d67..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone -declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_ldexp_f32: -; SI: v_ldexp_f32 -; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { - %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_ldexp_f64: -; SI: v_ldexp_f64 -; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { - %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll deleted file mode 100644 index 4cafd563685..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_legacy_f32: -; SI: v_rsq_legacy_f32_e32 -; EG: RECIPSQRT_IEEE -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.mul.ll deleted file mode 100644 index 83b56a5029d..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.mul.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.mul(float ,float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll 
deleted file mode 100644 index d2a655bf909..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f64: -; SI: v_rcp_f64_e32 -define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rcp_pat_f64: -; SI: v_rcp_f64_e32 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = fdiv double 1.0, %src - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE-NOT: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64 -; SI-SAFE: v_rcp_f64 -define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone - %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll deleted file mode 100644 index edd6e9a72f1..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rcp.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s - -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone - -declare float @llvm.sqrt.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f32: -; SI: v_rcp_f32_e32 -; EG: RECIP_IEEE -define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Evergreen only ever does unsafe fp math. 
-; FUNC-LABEL: {{^}}rcp_pat_f32: - -; SI-SAFE: v_rcp_f32_e32 -; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 - -; EG: RECIP_IEEE - -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = fdiv float 1.0, %src - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: -; SI-UNSAFE: v_rsq_f32_e32 -; SI-SAFE: v_sqrt_f32_e32 -; SI-SAFE: v_rcp_f32_e32 - -; EG: RECIPSQRT_IEEE -define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone - %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll deleted file mode 100644 index 67f1d22c717..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f64: -; SI: v_rsq_clamp_f64_e32 - -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] -; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] - -define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { - %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone - store double %rsq_clamped, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll deleted file mode 100644 index eeff2536b23..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f32: -; SI: v_rsq_clamp_f32_e32 - -; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] -; TODO: this constant should be folded: -; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff -; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] - -; EG: RECIPSQRT_CLAMPED - -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone - store float %rsq_clamped, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll deleted file mode 100644 index 36b72f14db1..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.rsq.ll +++ /dev/null @@ 
-1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; EG: RECIPSQRT_IEEE -define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; TODO: Really these should be constant folded -; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.tex.ll deleted file mode 100644 index 10206609bb5..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.tex.ll +++ /dev/null @@ -1,42 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN - -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %addr = load <4 x float>, <4 x float> addrspace(1)* %in - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) - %res7 = call 
<4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) - store <4 x float> %res16, <4 x float> addrspace(1)* %out - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll deleted file mode 100644 index 6b546a7e17c..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_trig_preop_f64: -; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] -; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: -; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.trunc.ll deleted file mode 100644 index 74792e50017..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.trunc.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: {{^}}amdgpu_trunc: -; SI: v_trunc_f32 - -define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { -entry: - %0 = call float @llvm.AMDGPU.trunc(float %x) - store float %0, float addrspace(1)* %out - ret void 
-} - -declare float @llvm.AMDGPU.trunc(float ) readnone diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.umad24.ll deleted file mode 100644 index 77a073b0cb0..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.umad24.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}test_umad24: -; SI: v_mad_u32_u24 -; EG: MULADD_UINT24 -; R600: MULLO_UINT -; R600: ADD_INT -define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}commute_umad24: -; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] -; SI: buffer_store_dword [[RESULT]] -define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 - - %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 - %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out.gep, align 4 - ret void -} - diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.umax.ll deleted file mode 100644 index a97d103016d..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.umax.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umax: -; SI: v_max_u32_e32 -define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umax: -; SI: s_max_u32 -define void @scalar_umax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umax: -; SI: buffer_load_ubyte 
[[VREG:v[0-9]+]], -; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: buffer_store_short [[RESULT]], -define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.umin.ll deleted file mode 100644 index 2acd10e0c63..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.umin.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umin: -; SI: v_min_u32_e32 -define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umin: -; SI: s_min_u32 -define void @scalar_umin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umin: -; SI: buffer_load_ubyte [[VREG:v[0-9]+]], -; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: buffer_store_short [[RESULT]], -define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/llvm/test/CodeGen/R600/llvm.AMDGPU.umul24.ll deleted file mode 100644 index 76624a078b3..00000000000 --- a/llvm/test/CodeGen/R600/llvm.AMDGPU.umul24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s 
-; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}test_umul24:
-; SI: v_mul_u32_u24
-; R600: MUL_UINT24
-; R600: MULLO_UINT
-define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
- %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone
- store i32 %mul, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/llvm.SI.fs.interp.ll b/llvm/test/CodeGen/R600/llvm.SI.fs.interp.ll
deleted file mode 100644
index 3d05da616e4..00000000000
--- a/llvm/test/CodeGen/R600/llvm.SI.fs.interp.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-
-;GCN-LABEL: {{^}}main:
-;GCN-NOT: s_wqm
-;GCN: s_mov_b32
-;GCN-NEXT: v_interp_mov_f32
-;GCN: v_interp_p1_f32
-;GCN: v_interp_p2_f32
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 {
-main_body:
- %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
- %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
- %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4)
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7)
- ret void
-}
-
-; Test that v_interp_p1 uses different source and destination registers
-; on 16 bank LDS chips.
-
-; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
-; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-
-define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 {
-main_body:
- %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
- %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
- %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7)
- %25 = call float @fabs(float %22)
- %26 = call float @fabs(float %23)
- %27 = call float @fabs(float %24)
- %28 = call i32 @llvm.SI.packf16(float %25, float %26)
- %29 = bitcast i32 %28 to float
- %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00)
- %31 = bitcast i32 %30 to float
- call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31)
- ret void
-}
-
-; Function Attrs: readnone
-declare float @fabs(float) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.constant(i32, i32, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
diff --git a/llvm/test/CodeGen/R600/llvm.SI.gather4.ll b/llvm/test/CodeGen/R600/llvm.SI.gather4.ll
deleted file mode 100644
index
275cb580bc9..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.gather4.ll +++ /dev/null @@ -1,509 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}gather4_v2: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl: -;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l: -;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b: -;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_v8: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_v2: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_o: -;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - 
-;CHECK-LABEL: {{^}}gather4_cl_o: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o_v8: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o_v8: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o_v8: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 
0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_o: -;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_o: -;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c: -;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_v8: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void 
@gather4_c_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_v8: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_v8: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl: -;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x 
i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz: -;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c_o: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_o_v8: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_o: -;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_o: -;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 
0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_o: -;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl_o: -;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 
x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
-declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.SI.getlod.ll b/llvm/test/CodeGen/R600/llvm.SI.getlod.ll deleted file mode 100644 index 06ee98e91b3..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.getlod.ll +++ /dev/null @@ -1,45 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}getlod: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v2: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v4: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - - -declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.SI.image.ll b/llvm/test/CodeGen/R600/llvm.SI.image.ll deleted file mode 100644 index 0fac8d79956..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.image.ll +++ /dev/null @@ -1,50 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs 
| FileCheck %s - -;CHECK-LABEL: {{^}}image_load: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @image_load() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @image_load_mip() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getresinfo() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.SI.image.sample.ll b/llvm/test/CodeGen/R600/llvm.SI.image.sample.ll deleted file mode 100644 index 4bc638a2806..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.image.sample.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: 
{{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: 
image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl 
{{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: 
image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/llvm/test/CodeGen/R600/llvm.SI.image.sample.o.ll deleted file mode 100644 index 9d8935414ed..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.image.sample.o.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> 
%r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - 
%r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - 
%r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = 
extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> 
@llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.SI.imageload.ll b/llvm/test/CodeGen/R600/llvm.SI.imageload.ll deleted file mode 100644 index b67716c3b66..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.imageload.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, - <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, - <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, - <32 x i8> 
undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, - <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, - <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6, - <32 x i8> undef, i32 6) - %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, - <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, - <32 x i8> undef, i32 11) - %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, - <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, - <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s9 = add i32 %s5, %e10 - %s10 = add i32 %s9, %e11 - %s14 = add i32 %s10, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -; Test that coordinates are stored in vgprs and not sgprs - ; CHECK: vgpr_coords - ; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} -define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 - %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 - %22 = getelementptr float, float addrspace(2)* %21, i32 0 - %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 - %24 = getelementptr float, float addrspace(2)* %21, i32 1 - %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 - %26 = getelementptr float, float addrspace(2)* %21, i32 4 - %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 - %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 - %30 = bitcast float %27 to i32 - %31 = bitcast float %23 to i32 - %32 = bitcast float %25 to i32 - %33 = insertelement <4 x i32> undef, i32 %31, i32 0 - %34 = insertelement <4 x i32> %33, i32 %32, i32 1 - %35 = insertelement <4 x i32> %34, i32 %30, i32 2 - %36 = insertelement <4 x 
i32> %35, i32 undef, i32 3 - %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) - %38 = extractelement <4 x i32> %37, i32 0 - %39 = extractelement <4 x i32> %37, i32 1 - %40 = extractelement <4 x i32> %37, i32 2 - %41 = extractelement <4 x i32> %37, i32 3 - %42 = bitcast i32 %38 to float - %43 = bitcast i32 %39 to float - %44 = bitcast i32 %40 to float - %45 = bitcast i32 %41 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) - ret void -} - -declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone -; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{} -!2 = !{!0, !0, i64 0, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.SI.load.dword.ll b/llvm/test/CodeGen/R600/llvm.SI.load.dword.ll deleted file mode 100644 index f6c258539d5..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.load.dword.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -; Example of a simple geometry shader loading vertex attributes from the -; ESGS ring buffer - -; FIXME: Out of bounds immediate offset crashes - -; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc - -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { -main_body: - %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) - %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) - %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) - %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp19 = bitcast i32 %tmp18 to float - - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp21 = bitcast i32 %tmp20 to 
float - - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp23 = bitcast i32 %tmp22 to float - - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) - ret void -} - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.SI.resinfo.ll b/llvm/test/CodeGen/R600/llvm.SI.resinfo.ll deleted file mode 100644 index ac95fd0b83a..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.resinfo.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, - i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { - %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) - %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) - %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) - %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) - %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) - %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) - %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) - %res14 = call 
<4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) - %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t4 = extractelement <4 x i32> %res7, i32 0 - %t5 = extractelement <4 x i32> %res7, i32 3 - %e7 = add i32 %t4, %t5 - %t6 = extractelement <4 x i32> %res8, i32 1 - %t7 = extractelement <4 x i32> %res8, i32 2 - %e8 = add i32 %t6, %t7 - %t8 = extractelement <4 x i32> %res9, i32 1 - %t9 = extractelement <4 x i32> %res9, i32 3 - %e9 = add i32 %t8, %t9 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t16 = extractelement <4 x i32> %res12, i32 0 - %t17 = extractelement <4 x i32> %res12, i32 1 - %t18 = extractelement <4 x i32> %res12, i32 3 - %t19 = add i32 %t16, %t17 - %e12 = add i32 %t18, %t19 - %t20 = extractelement <4 x i32> %res13, i32 0 - %t21 = extractelement <4 x i32> %res13, i32 2 - %t22 = extractelement <4 x i32> %res13, i32 3 - %t23 = add i32 %t20, %t21 - %e13 = add i32 %t22, %t23 - %t24 = extractelement <4 x i32> %res14, i32 1 - %t25 = extractelement <4 x i32> %res14, i32 2 - %t26 = extractelement <4 x i32> %res14, i32 3 - %t27 = add i32 %t24, %t25 - %e14 = add i32 %t26, %t27 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s6 = add i32 %s5, %e7 - %s7 = add i32 %s6, %e8 - %s8 = add i32 %s7, %e9 - %s9 = add i32 %s8, %e10 - %s10 = add i32 %s9, %e11 - %s11 = add i32 %s10, %e12 - %s12 = add i32 %s11, %e13 - %s13 = add i32 %s12, %e14 - %s14 = add i32 %s13, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/llvm.SI.sample-masked.ll b/llvm/test/CodeGen/R600/llvm.SI.sample-masked.ll deleted file mode 100644 index ce9558cbf81..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.sample-masked.ll +++ /dev/null @@ -1,96 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s - -; CHECK-LABEL: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, 
<16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v2: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 -define void @v2(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v3: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -define void @v3(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v4: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 -define void @v4(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v5: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -define void @v5(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v6: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 -define void @v6(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v7: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 -define void @v7(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.SI.sample.ll 
b/llvm/test/CodeGen/R600/llvm.SI.sample.ll deleted file mode 100644 index 509c45f588b..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.sample.ll +++ /dev/null @@ -1,160 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> 
@llvm.SI.sample.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -; CHECK: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - %5 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) - ret void -} - - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare <4 x 
float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.SI.sampled.ll b/llvm/test/CodeGen/R600/llvm.SI.sampled.ll deleted file mode 100644 index f2badff2a99..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.sampled.ll +++ /dev/null @@ -1,143 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, - 
<32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll b/llvm/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll deleted file mode 100644 index 2198590f2df..00000000000 --- 
a/llvm/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll +++ /dev/null @@ -1,20 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}main: -; BOTH: s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; BOTH-NEXT: s_sendmsg Gs_done(nop) -; BOTH-NEXT: s_endpgm - -define void @main(i32 inreg %a) #0 { -main_body: - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #1 - -attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/R600/llvm.SI.sendmsg.ll b/llvm/test/CodeGen/R600/llvm.SI.sendmsg.ll deleted file mode 100644 index 09675d50335..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.sendmsg.ll +++ /dev/null @@ -1,24 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg Gs(emit stream 0) -; CHECK: s_sendmsg Gs(cut stream 1) -; CHECK: s_sendmsg Gs(emit-cut stream 2) -; CHECK: s_sendmsg Gs_done(nop) - -define void @main() { -main_body: - call void @llvm.SI.sendmsg(i32 34, i32 0); - call void @llvm.SI.sendmsg(i32 274, i32 0); - call void @llvm.SI.sendmsg(i32 562, i32 0); - call void @llvm.SI.sendmsg(i32 3, i32 0); - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/llvm/test/CodeGen/R600/llvm.SI.tbuffer.store.ll deleted file mode 100644 index 71f51548a5f..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.tbuffer.store.ll +++ /dev/null @@ -1,47 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test1(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test2(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test3(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, - i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 
-1, 0, 0 -define void @test4(i32 %vdata, i32 %vaddr) #0 { - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, - i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/llvm.SI.tid.ll b/llvm/test/CodeGen/R600/llvm.SI.tid.ll deleted file mode 100644 index f6e6d7050ba..00000000000 --- a/llvm/test/CodeGen/R600/llvm.SI.tid.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -;GCN: v_mbcnt_lo_u32_b32_e64 -;SI: v_mbcnt_hi_u32_b32_e32 -;VI: v_mbcnt_hi_u32_b32_e64 - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call i32 @llvm.SI.tid() - %5 = bitcast i32 %4 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -declare i32 @llvm.SI.tid() readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/llvm/test/CodeGen/R600/llvm.amdgpu.dp4.ll deleted file mode 100644 index 036cd2ca82a..00000000000 --- a/llvm/test/CodeGen/R600/llvm.amdgpu.dp4.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone - -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { - %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 - %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 - %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone - store float %dp4, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/llvm/test/CodeGen/R600/llvm.amdgpu.kilp.ll deleted file mode 100644 index 42df6db1ccf..00000000000 --- a/llvm/test/CodeGen/R600/llvm.amdgpu.kilp.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kilp_gs_const: -; SI: s_mov_b64 exec, 0 -define void @kilp_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %3) - ret void -} - -declare void @llvm.AMDGPU.kilp(float) - -attributes #0 = { "ShaderType"="2" } - -!0 = !{!"const", null, i32 1} diff --git a/llvm/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/llvm/test/CodeGen/R600/llvm.amdgpu.lrp.ll deleted file mode 100644 index 4e4c2ec7791..00000000000 --- a/llvm/test/CodeGen/R600/llvm.amdgpu.lrp.ll +++ /dev/null 
@@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}test_lrp: -; SI: v_sub_f32 -; SI: v_mad_f32 -define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { - %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone - store float %mad, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.cos.ll b/llvm/test/CodeGen/R600/llvm.cos.ll deleted file mode 100644 index c65df8b3e8d..00000000000 --- a/llvm/test/CodeGen/R600/llvm.cos.ll +++ /dev/null @@ -1,41 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -;FUNC-LABEL: test -;EG: MULADD_IEEE * -;EG: FRACT * -;EG: ADD * -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @test(float addrspace(1)* %out, float %x) #1 { - %cos = call float @llvm.cos.f32(float %x) - store float %cos, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: testv -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { - %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) - store <4 x float> %cos, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.cos.f32(float) readnone -declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.exp2.ll b/llvm/test/CodeGen/R600/llvm.exp2.ll deleted file mode 100644 index 42698925aae..00000000000 --- a/llvm/test/CodeGen/R600/llvm.exp2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: EXP_IEEE -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.exp2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/R600/llvm.log2.ll b/llvm/test/CodeGen/R600/llvm.log2.ll deleted file mode 100644 index c75e7850b35..00000000000 --- a/llvm/test/CodeGen/R600/llvm.log2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: LOG_IEEE -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.log2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.log2.f32(float) readnone -declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/llvm/test/CodeGen/R600/llvm.memcpy.ll b/llvm/test/CodeGen/R600/llvm.memcpy.ll deleted file mode 100644 index e491732cf9c..00000000000 --- a/llvm/test/CodeGen/R600/llvm.memcpy.ll +++ /dev/null @@ -1,365 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind -declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind - - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 
-; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind - ret void -} - -; FIXME: Use 64-bit ops -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort - -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short - -; 
SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind - ret void -} diff --git a/llvm/test/CodeGen/R600/llvm.pow.ll b/llvm/test/CodeGen/R600/llvm.pow.ll deleted file mode 100644 index c4ae652619c..00000000000 --- a/llvm/test/CodeGen/R600/llvm.pow.ll +++ /dev/null @@ -1,40 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-LABEL: test1: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, - -define void @test1(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.pow.f32( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -;CHECK-LABEL: test2: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.pow.f32(float ,float ) readonly -declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.rint.f64.ll b/llvm/test/CodeGen/R600/llvm.rint.f64.ll deleted file mode 100644 index c63fb172794..00000000000 --- a/llvm/test/CodeGen/R600/llvm.rint.f64.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f64: -; CI: v_rndne_f64_e32 - -; SI-DAG: v_add_f64 -; SI-DAG: v_add_f64 -; SI-DAG: v_cmp_gt_f64_e64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { -entry: - %0 = call double @llvm.rint.f64(double %in) - store double %0, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { -entry: - %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) - store <2 x double> %0, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v4f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { -entry: - %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) - store <4 x double> %0, <4 x double> addrspace(1)* %out - ret void -} - - -declare double @llvm.rint.f64(double) #0 -declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 -declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/llvm/test/CodeGen/R600/llvm.rint.ll b/llvm/test/CodeGen/R600/llvm.rint.ll deleted file mode 100644 index 661db51ad03..00000000000 --- a/llvm/test/CodeGen/R600/llvm.rint.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.rint.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f32: -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; 
FUNC-LABEL: {{^}}rint_v4f32: -; R600: RNDNE -; R600: RNDNE -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDIL.round.nearest.f32(float) #0 -declare float @llvm.rint.f32(float) #0 -declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 -declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.round.f64.ll b/llvm/test/CodeGen/R600/llvm.round.f64.ll deleted file mode 100644 index 3d0f57e3328..00000000000 --- a/llvm/test/CodeGen/R600/llvm.round.f64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f64: -; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out - ret void -} - -; This is a pretty large function, so just test a few of the -; instructions that are necessary. - -; FUNC-LABEL: {{^}}v_round_f64: -; SI: buffer_load_dwordx2 -; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 - -; SI-DAG: v_not_b32_e32 -; SI-DAG: v_not_b32_e32 - -; SI-DAG: v_cmp_eq_i32 - -; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff -; SI-DAG: v_cmp_gt_i32_e64 -; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] - -; SI-DAG: v_cmp_gt_i32_e64 - - -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid - %x = load double, double addrspace(1)* %gep - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out.gep - ret void -} - -; FUNC-LABEL: {{^}}round_v2f64: -; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { - %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f64: -; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { - %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f64: -; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { - %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 - -declare double @llvm.round.f64(double) #1 -declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 -declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 -declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 - 
-attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.round.ll b/llvm/test/CodeGen/R600/llvm.round.ll deleted file mode 100644 index f5f124d915a..00000000000 --- a/llvm/test/CodeGen/R600/llvm.round.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f32: -; SI-DAG: s_load_dword [[SX:s[0-9]+]] -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff -; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] -; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] -; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] -; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] -; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| -; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] -; SI: buffer_store_dword [[RESULT]] - -; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] -; R600-DAG: ADD {{.*}}, -; R600-DAG: BFI_INT -; R600-DAG: SETGE -; R600-DAG: CNDE -; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { - %result = call float @llvm.round.f32(float %x) #1 - store float %result, float addrspace(1)* %out - ret void -} - -; The vector tests are really difficult to verify, since it can be hard to -; predict how the scheduler will order the instructions. We already have -; a test for the scalar case, so the vector tests just check that the -; compiler doesn't crash. - -; FUNC-LABEL: {{^}}round_v2f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { - %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { - %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { - %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 - store <8 x float> %result, <8 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.round.f32(float) #1 -declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 -declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 -declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/llvm.sin.ll b/llvm/test/CodeGen/R600/llvm.sin.ll deleted file mode 100644 index 3bb245c2e24..00000000000 --- a/llvm/test/CodeGen/R600/llvm.sin.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn 
-mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s - -; FUNC-LABEL: sin_f32 -; EG: MULADD_IEEE * -; EG: FRACT * -; EG: ADD * -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { - %sin = call float @llvm.sin.f32(float %x) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_3x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ef47644 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 3.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_2x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_2sin_f32: -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_v4f32: -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { - %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) - store <4 x float> %sin, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sin.f32(float) readnone -declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/llvm.sqrt.ll b/llvm/test/CodeGen/R600/llvm.sqrt.ll deleted file mode 100644 index c6da047f539..00000000000 --- a/llvm/test/CodeGen/R600/llvm.sqrt.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI - -; R600-LABEL: {{^}}sqrt_f32: -; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z -; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -; SI-LABEL: {{^}}sqrt_f32: -; SI: v_sqrt_f32_e32 -define void @sqrt_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.sqrt.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v2f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -; SI-LABEL: {{^}}sqrt_v2f32: -; SI: v_sqrt_f32_e32 -; SI: 
v_sqrt_f32_e32 -define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v4f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -; SI-LABEL: {{^}}sqrt_v4f32: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp olt float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_ult: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp ult float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> - %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2_ult -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> - %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %in) -declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/R600/load-i1.ll b/llvm/test/CodeGen/R600/load-i1.ll deleted file mode 100644 index 0ca49fde3e7..00000000000 --- a/llvm/test/CodeGen/R600/load-i1.ll +++ /dev/null @@ -1,149 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}global_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: 
VTX_READ_8 -; EG: AND_INT -define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}local_copy_i1_to_i1: -; SI: ds_read_u8 -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: ds_write_b8 -; SI: s_endpgm - -; EG: LDS_UBYTE_READ_RET -; EG: AND_INT -; EG: LDS_BYTE_WRITE -define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { - %load = load i1, i1 addrspace(3)* %in - store i1 %load, i1 addrspace(3)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: AND_INT -define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { - %load = load i1, i1 addrspace(2)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dword -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm - -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 -; SI: buffer_store_byte -; SI: s_endpgm -define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { - store i1 %x, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i64: -; SI: buffer_load_ubyte -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_sext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}i1_arg_sext_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/load-input-fold.ll b/llvm/test/CodeGen/R600/load-input-fold.ll deleted file mode 100644 index 1daf0e6527b..00000000000 --- a/llvm/test/CodeGen/R600/load-input-fold.ll +++ /dev/null @@ -1,117 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = load <4 x float>, <4 x float> addrspace(8)* null - %13 = extractelement <4 x float> %12, i32 0 - %14 = fmul float %0, %13 - %15 = load <4 x float>, <4 x float> addrspace(8)* null - %16 = extractelement <4 x float> %15, i32 1 - %17 = fmul float %0, %16 - %18 = load <4 x float>, <4 x float> addrspace(8)* null - %19 = extractelement <4 x float> %18, i32 2 - %20 = fmul float %0, %19 - %21 = load <4 x float>, <4 x float> addrspace(8)* null - %22 = extractelement <4 x float> %21, i32 3 - %23 = fmul float %0, %22 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %25 = extractelement <4 x float> %24, i32 0 - %26 = fmul float %1, %25 - %27 = fadd float %26, %14 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %29 = extractelement <4 x float> %28, i32 1 - %30 = fmul float %1, %29 - %31 = fadd float %30, %17 - %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %33 = extractelement <4 x float> %32, i32 2 - %34 = fmul float %1, %33 - %35 = fadd float %34, %20 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 3 - %38 = fmul float %1, %37 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %2, %41 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %2, %45 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %2, %49 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> 
addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %53 = extractelement <4 x float> %52, i32 3 - %54 = fmul float %2, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %3, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %3, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %3, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %3, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %73 = extractelement <4 x float> %72, i32 0 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %75 = extractelement <4 x float> %74, i32 1 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %77 = extractelement <4 x float> %76, i32 2 - %78 = insertelement <4 x float> undef, float %4, i32 0 - %79 = insertelement <4 x float> %78, float %5, i32 1 - %80 = insertelement <4 x float> %79, float %6, i32 2 - %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 - %82 = insertelement <4 x float> undef, float %73, i32 0 - %83 = insertelement <4 x float> %82, float %75, i32 1 - %84 = insertelement <4 x float> %83, float %77, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) - %87 = insertelement <4 x float> undef, float %86, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/R600/load.ll b/llvm/test/CodeGen/R600/load.ll deleted file mode 100644 index 93b1b51a0d0..00000000000 --- a/llvm/test/CodeGen/R600/load.ll +++ /dev/null @@ -1,709 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: 
llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -;===------------------------------------------------------------------------===; -; GLOBAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the global address space. -; FUNC-LABEL: {{^}}load_i8: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte -define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = load i8, i8 addrspace(1)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 - -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = sext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i16: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16 , i16 addrspace(1)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16, i16 addrspace(1)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the global address space. -; FUNC-LABEL: {{^}}load_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the global address space -; FUNC-LABEL: {{^}}load_v2f32: -; R600: MEM_RAT -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64: -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_sext: -; R600: MEM_RAT -; R600: MEM_RAT -; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x -; R600: 31 -; SI: buffer_load_dword - -define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = sext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_zext: -; R600: MEM_RAT -; R600: MEM_RAT -define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = zext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v8i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { -entry: - %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v16i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { -entry: - %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; CONSTANT ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load a sign-extended i8 value -; FUNC-LABEL: {{^}}load_const_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte v{{[0-9]+}}, -define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_aligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = zext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_unaligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 - %1 = load i8, i8 addrspace(2)* %0 - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load a sign-extended i16 value -; FUNC-LABEL: {{^}}load_const_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_aligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_unaligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 - %1 = load i16, i16 addrspace(2)* %0 - %2 = zext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load an i32 value from the constant address space. 
-; FUNC-LABEL: {{^}}load_const_addrspace_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { -entry: - %0 = load i32, i32 addrspace(2)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Load a f32 value from the constant address space. -; FUNC-LABEL: {{^}}load_const_addrspace_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { - %1 = load float, float addrspace(2)* %in - store float %1, float addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; LOCAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the local address space. -; FUNC-LABEL: {{^}}load_i8_local: -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { - %1 = load i8, i8 addrspace(3)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext_local: -; R600: LDS_UBYTE_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { -entry: - %0 = load i8, i8 addrspace(3)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = sext 
<4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the local address space. -; FUNC-LABEL: {{^}}load_i16_local: -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16 , i16 addrspace(3)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_local: -; R600: LDS_USHORT_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16, i16 addrspace(3)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the local address space. -; FUNC-LABEL: {{^}}load_i32_local: -; R600: LDS_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the local address space. 
-; FUNC-LABEL: {{^}}load_f32_local: -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { -entry: - %0 = load float, float addrspace(3)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the local address space -; FUNC-LABEL: {{^}}load_v2f32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b64 -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(3)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; Test loading a i32 and v2i32 value from the same base pointer. -; FUNC-LABEL: {{^}}load_i32_v2i32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI-DAG: ds_read_b32 -; SI-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { - %scalar = load i32, i32 addrspace(3)* %in - %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* - %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 - %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 - %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0 - %vec = add <2 x i32> %vec0, %vec1 - store <2 x i32> %vec, <2 x i32> addrspace(1)* %out - ret void -} - - -@lds = addrspace(3) global [512 x i32] undef, align 4 - -; On SI we need to make sure that the base offset is a register and not -; an immediate. -; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; SI: ds_read_b32 v0, v[[ZERO]] offset:4 -; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 - %tmp1 = load i32, i32 addrspace(3)* %tmp0 - %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp1, i32 addrspace(1)* %tmp2 - ret void -} diff --git a/llvm/test/CodeGen/R600/load.vec.ll b/llvm/test/CodeGen/R600/load.vec.ll deleted file mode 100644 index 02f883cd8e9..00000000000 --- a/llvm/test/CodeGen/R600/load.vec.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; load a v2i32 value from the global address space. -; EG: {{^}}load_v2i32: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v2i32: -; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - store <2 x i32> %a, <2 x i32> addrspace(1)* %out - ret void -} - -; load a v4i32 value from the global address space. 
-; EG: {{^}}load_v4i32: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v4i32: -; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] -define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - store <4 x i32> %a, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/load64.ll b/llvm/test/CodeGen/R600/load64.ll deleted file mode 100644 index 74beabdc007..00000000000 --- a/llvm/test/CodeGen/R600/load64.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; load a f64 value from the global address space. -; CHECK-LABEL: {{^}}load_f64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %1 = load double, double addrspace(1)* %in - store double %1, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}load_i64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tmp = load i64, i64 addrspace(1)* %in - store i64 %tmp, i64 addrspace(1)* %out, align 8 - ret void -} - -; Load a f64 value from the constant address space. -; CHECK-LABEL: {{^}}load_const_addrspace_f64: -; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { - %1 = load double, double addrspace(2)* %in - store double %1, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/local-64.ll b/llvm/test/CodeGen/R600/local-64.ll deleted file mode 100644 index 33f3159d13e..00000000000 --- a/llvm/test/CodeGen/R600/local-64.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}local_i32_load -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 - %val = load i32, i32 addrspace(3)* %gep, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i32_load_0_offset -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %val = load i32, i32 addrspace(3)* %in, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: -; BOTH-NOT: ADD -; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: 
{{^}}local_i8_load_over_i16_max_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. -; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] -; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 - %val = load i64, i64 addrspace(3)* %gep, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %val = load i64, i64 addrspace(3)* %in, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %gep = getelementptr double, double addrspace(3)* %in, i32 7 - %val = load double, double addrspace(3)* %gep, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %val = load double, double addrspace(3)* %in, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 - store i64 5678, i64 addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { - store i64 1234, i64 addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { - %gep = getelementptr double, double addrspace(3)* %out, i32 7 - store double 16.0, double addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store_0_offset -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { - store double 20.0, double addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_v2i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:112
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
-; BOTH: s_endpgm
-define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
- %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
- store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
- ret void
-}
-
-; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
-; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
-; BOTH: s_endpgm
-define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
- store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
- ret void
-}
-
-; BOTH-LABEL: {{^}}local_v4i64_store:
-; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
-; BOTH: s_endpgm
-define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
- %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
- store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
- ret void
-}
-
-; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
-; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
-; BOTH: s_endpgm
-define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
- store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/local-atomics.ll b/llvm/test/CodeGen/R600/local-atomics.ll
deleted file mode 100644
index 2aaf977ab90..00000000000
--- a/llvm/test/CodeGen/R600/local-atomics.ll
+++ /dev/null
@@ -1,551 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
-; EG: LDS_WRXCHG_RET *
-; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
-; GCN: s_load_dword [[SPTR:s[0-9]+]],
-; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
-; EG: LDS_WRXCHG_RET *
-; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)*
%out, align 4 - ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4
- %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
-; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
-; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
-; store i32 %result, i32 addrspace(1)* %out, align 4
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
-; EG: LDS_MIN_INT_RET *
-; GCN: ds_min_rtn_i32
-; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
-; EG: LDS_MIN_INT_RET *
-; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
-; EG: LDS_MAX_INT_RET *
-; GCN: ds_max_rtn_i32
-; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
-; EG: LDS_MAX_INT_RET *
-; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
-; EG: LDS_MIN_UINT_RET *
-; GCN: ds_min_rtn_u32
-; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
-; EG: LDS_MIN_UINT_RET *
-; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
-; EG: LDS_MAX_UINT_RET *
-; GCN: ds_max_rtn_u32
-; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
-; EG: LDS_MAX_UINT_RET *
-; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
- %gep =
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: -; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_u32 [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: -; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: -; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: -; GCN: ds_sub_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
-; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
-; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
-; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
-; GCN: s_endpgm
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
-; GCN: ds_and_b32
-; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
-; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
-; GCN: ds_or_b32
-; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
-; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
-; GCN: ds_xor_b32
-; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
-; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
- %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
- ret void
-}
-
-; FIXME: There is no atomic nand instruction, so we somehow need to expand this (one possible expansion is sketched below).
-; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
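As a hedged aside (not part of the original patch): one standard way to expand `atomicrmw nand`, and the shape LLVM's generic AtomicExpand-style lowering produces, is a compare-exchange loop, since nand(x, y) = ~(x & y). The function name below is illustrative only, and the constant 4 simply mirrors the surrounding tests:

  define void @lds_atomic_nand_expand_sketch_i32(i32 addrspace(3)* %ptr) nounwind {
  entry:
    %init = load i32, i32 addrspace(3)* %ptr, align 4
    br label %atomicrmw.start

  atomicrmw.start:
    ; carry the last observed value around the loop
    %loaded = phi i32 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
    ; nand: complement of the AND of the old value and the operand
    %and = and i32 %loaded, 4
    %new = xor i32 %and, -1
    %pair = cmpxchg i32 addrspace(3)* %ptr, i32 %loaded, i32 %new seq_cst seq_cst
    %newloaded = extractvalue { i32, i1 } %pair, 0
    %success = extractvalue { i32, i1 } %pair, 1
    br i1 %success, label %atomicrmw.end, label %atomicrmw.start

  atomicrmw.end:
    ret void
  }

Once such an expansion is wired up, the disabled test below would be expected to pass.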
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: -; GCN: ds_min_i32 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: -; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: -; GCN: ds_max_i32 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: -; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: -; GCN: ds_min_u32 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: -; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: -; GCN: ds_max_u32 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: -; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} diff --git a/llvm/test/CodeGen/R600/local-atomics64.ll b/llvm/test/CodeGen/R600/local-atomics64.ll deleted file mode 100644 index 0ffa5e751b7..00000000000 --- a/llvm/test/CodeGen/R600/local-atomics64.ll +++ /dev/null @@ -1,470 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: -; GCN: ds_wrxchg_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: -; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, 
i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: -; GCN: ds_add_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: -; GCN: ds_inc_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: -; GCN: ds_sub_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: -; GCN: ds_sub_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: -; GCN: ds_dec_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
-; GCN: ds_and_rtn_b64
-; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
-; GCN: ds_and_rtn_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
-; GCN: ds_or_rtn_b64
-; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
-; GCN: ds_or_rtn_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
-; GCN: ds_xor_rtn_b64
-; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
-; GCN: ds_xor_rtn_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FIXME: There is no atomic nand instruction, so we somehow need to expand this (one possible expansion is sketched below).
-; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:
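The i64 "ret" variant referenced below would expand the same way with 64-bit operations; the value the `atomicrmw` yields is the old value observed by the successful `cmpxchg`. Again a hedged sketch with an illustrative name, not part of the original patch:

  define void @lds_atomic_nand_ret_expand_sketch_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
  entry:
    %init = load i64, i64 addrspace(3)* %ptr, align 8
    br label %atomicrmw.start

  atomicrmw.start:
    %loaded = phi i64 [ %init, %entry ], [ %newloaded, %atomicrmw.start ]
    %and = and i64 %loaded, 4
    %new = xor i64 %and, -1
    %pair = cmpxchg i64 addrspace(3)* %ptr, i64 %loaded, i64 %new seq_cst seq_cst
    %newloaded = extractvalue { i64, i1 } %pair, 0
    %success = extractvalue { i64, i1 } %pair, 1
    br i1 %success, label %atomicrmw.end, label %atomicrmw.start

  atomicrmw.end:
    ; on success, %newloaded is the value that was in memory before the swap
    store i64 %newloaded, i64 addrspace(1)* %out, align 8
    ret void
  }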
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
-; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
-; store i64 %result, i64 addrspace(1)* %out, align 8
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
-; GCN: ds_min_rtn_i64
-; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
-; GCN: ds_min_rtn_i64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
-; GCN: ds_max_rtn_i64
-; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
-; GCN: ds_max_rtn_i64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
-; GCN: ds_min_rtn_u64
-; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
-; GCN: ds_min_rtn_u64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
-; GCN: ds_max_rtn_u64
-; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
-; GCN: ds_max_rtn_u64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
- store i64 %result, i64 addrspace(1)* %out, align 8
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
-; GCN: ds_wrxchg_rtn_b64
-; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
-; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
-
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: -; GCN: ds_add_u64 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: -; GCN: ds_inc_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: -; GCN: ds_sub_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: -; GCN: ds_sub_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: -; GCN: ds_dec_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: -; GCN: ds_and_b64 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: -; GCN: ds_and_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64:
-; GCN: ds_or_b64
-; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
-; GCN: ds_or_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
-; GCN: ds_xor_b64
-; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
-; GCN: ds_xor_b64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
-
-; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
-; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-; %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
-; ret void
-; }
-
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
-; GCN: ds_min_i64
-; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
-; GCN: ds_min_i64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
-; GCN: ds_max_i64
-; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
-; GCN: ds_max_i64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
-; GCN: ds_min_u64
-; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
-; GCN: ds_min_u64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
- %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
-; GCN: ds_max_u64
-; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
- %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
- ret void
-}
-
-; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
-; GCN: ds_max_u64 {{.*}} offset:32
-; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
- %gep = getelementptr i64, i64 addrspace(3)*
%ptr, i32 4
- %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/local-memory-two-objects.ll b/llvm/test/CodeGen/R600/local-memory-two-objects.ll
deleted file mode 100644
index 06a8b1246e6..00000000000
--- a/llvm/test/CodeGen/R600/local-memory-two-objects.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
-
-@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
-
-
-; Check that the LDS size is emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 8
-; GCN: .long 47180
-; GCN-NEXT: .long 38792
-
-; EG: {{^}}local_memory_two_objects:
-
-; We would like to check that the lds writes are using different
-; addresses, but due to variations in the scheduler, we can't do
-; this consistently on evergreen GPUs.
-; EG: LDS_WRITE
-; EG: LDS_WRITE
-; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-
-; Make sure the lds reads are using different addresses, at different
-; constant offsets.
-; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
-; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
-
-define void @local_memory_two_objects(i32 addrspace(1)* %out) {
-entry:
- %x.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
- store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
- %mul = shl nsw i32 %x.i, 1
- %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
- store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
- %sub = sub nsw i32 3, %x.i
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
- %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
- store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
- %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
- %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
- %add = add nsw i32 %x.i, 4
- %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
- store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
- ret void
-}
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
-
-attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/R600/local-memory.ll b/llvm/test/CodeGen/R600/local-memory.ll
deleted file mode 100644
index 9494ed75bd0..00000000000
--- a/llvm/test/CodeGen/R600/local-memory.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG
-check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-
-@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
-
-
-; Check that the LDS size is emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 128
-; SI: .long 47180
-; SI-NEXT: .long 71560
-; CI: .long 47180
-; CI-NEXT: .long 38792
-
-; FUNC-LABEL: {{^}}local_memory:
-
-; EG: LDS_WRITE
-; SI-NOT: s_wqm_b64
-; SI: ds_write_b32
-
-; GROUP_BARRIER must be the last instruction in a clause
-; EG: GROUP_BARRIER
-; EG-NEXT: ALU clause
-; SI: s_barrier
-
-; EG: LDS_READ_RET
-; SI: ds_read_b32 {{v[0-9]+}},
-
-define void @local_memory(i32 addrspace(1)* %out) {
-entry:
- %y.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
- store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
- %add = add nsw i32 %y.i, 1
- %cmp = icmp eq i32 %add, 16
- %.add = select i1 %cmp, i32 0, i32 %add
- call void @llvm.AMDGPU.barrier.local()
- %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
- %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
- store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
- ret void
-}
-
-declare i32 @llvm.r600.read.tidig.x() #0
-declare void @llvm.AMDGPU.barrier.local()
-
-attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/R600/loop-address.ll b/llvm/test/CodeGen/R600/loop-address.ll
deleted file mode 100644
index f60d574497d..00000000000
--- a/llvm/test/CodeGen/R600/loop-address.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: ALU_PUSH
-;CHECK: LOOP_START_DX10 @11
-;CHECK: LOOP_BREAK @10
-;CHECK: POP @10
-
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
-entry:
- %cmp5 = icmp sgt i32 %iterations, 0
- br i1 %cmp5, label %for.body, label %for.end
-
-for.body: ; preds = %for.body, %entry
- %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
- %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
- %i.07 = add nsw i32 %i.07.in, -1
- %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
- store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
- %add = add nsw i32 %ai.06, 1
- %exitcond = icmp eq i32 %add, %iterations
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
-
-!opencl.kernels = !{!0, !1, !2, !3}
-
-!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge}
-!1 = !{null}
-!2 = !{null}
-!3 = !{null}
diff --git a/llvm/test/CodeGen/R600/loop-idiom.ll b/llvm/test/CodeGen/R600/loop-idiom.ll
deleted file mode 100644
index 5fd9806813c..00000000000
--- a/llvm/test/CodeGen/R600/loop-idiom.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck
--check-prefix=SI --check-prefix=FUNC %s - - -; Make sure loop-idiom doesn't create memcpy or memset. There are no library -; implementations of these for R600. - -; FUNC: @no_memcpy -; R600-NOT: {{^}}llvm.memcpy -; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%4, %for.body] - %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 - %2 = getelementptr i8, i8* %dest, i32 %0 - %3 = load i8, i8 addrspace(3)* %1 - store i8 %3, i8* %2 - %4 = add i32 %0, 1 - %5 = icmp eq i32 %4, %size - br i1 %5, label %for.end, label %for.body - -for.end: - ret void -} - -; FUNC: @no_memset -; R600-NOT: {{^}}llvm.memset -; R600-NOT: {{^}}memset_pattern16: -; SI-NOT: {{^}}llvm.memset -; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%2, %for.body] - %1 = getelementptr i8, i8* %dest, i32 %0 - store i8 0, i8* %1 - %2 = add i32 %0, 1 - %3 = icmp eq i32 %2, %size - br i1 %3, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/llvm/test/CodeGen/R600/lshl.ll b/llvm/test/CodeGen/R600/lshl.ll deleted file mode 100644 index 9ac988d38d1..00000000000 --- a/llvm/test/CodeGen/R600/lshl.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = mul i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/lshr.ll b/llvm/test/CodeGen/R600/lshr.ll deleted file mode 100644 index 50e444ac26b..00000000000 --- a/llvm/test/CodeGen/R600/lshr.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = udiv i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/m0-spill.ll b/llvm/test/CodeGen/R600/m0-spill.ll deleted file mode 100644 index 1dddc85f775..00000000000 --- a/llvm/test/CodeGen/R600/m0-spill.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -@lds = external addrspace(3) global [64 x float] - -; CHECK-LABEL: {{^}}main: -; CHECK-NOT: v_readlane_b32 m0 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %cmp = fcmp ueq float 0.0, %4 - br i1 %cmp, label %if, label %else - -if: - %lds_ptr = getelementptr [64 x 
float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 - %lds_data = load float, float addrspace(3)* %lds_ptr - br label %endif - -else: - %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - br label %endif - -endif: - %export = phi float [%lds_data, %if], [%interp, %else] - %5 = call i32 @llvm.SI.packf16(float %export, float %export) - %6 = bitcast i32 %5 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) - ret void -} - -declare float @llvm.SI.fs.constant(i32, i32, i32) readnone - -declare i32 @llvm.SI.packf16(float, float) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/mad-combine.ll b/llvm/test/CodeGen/R600/mad-combine.ll deleted file mode 100644 index bc071628ead..00000000000 --- a/llvm/test/CodeGen/R600/mad-combine.ll +++ /dev/null @@ -1,567 +0,0 @@ -; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. - -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s - -; Make sure we don't form mad with denormals -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.fma.f32(float, float, float) #0 -declare float @llvm.fmuladd.f32(float, float, float) #0 - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF-NOT: v_fma -; SI-DENORM-SLOWFMAF-NOT: v_mad - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %mul, %c - store float %fma, float addrspace(1)* 
%gep.out - ret void -} - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fadd float %mul, %c - %fma1 = fadd float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_mad_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float 
addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %mul, %c - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - 
%c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fsub float %mul, %c - %fma1 = fsub float %mul, %d - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = 
load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fsub float %c, %mul - %fma1 = fsub float %d, %mul - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma = fsub float %mul.neg, %c - - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - 
%gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul.neg, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], 
[[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float 
addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fmuladd y, z, (fmul u, v))) -; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-STD: v_mad_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/R600/mad-sub.ll b/llvm/test/CodeGen/R600/mad-sub.ll deleted file mode 100644 index aa4194ff610..00000000000 --- a/llvm/test/CodeGen/R600/mad-sub.ll +++ /dev/null @@ -1,215 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}mad_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture 
readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %c, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_f64: -; SI: v_mul_f64 -; SI: v_add_f64 -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext - %a = load double, double addrspace(1)* %gep0, align 8 - %b = load double, double addrspace(1)* %gep1, align 8 - %c = load double, double addrspace(1)* %gep2, align 8 - %mul = fmul double %a, %b - %sub = fsub double %mul, %c - store double %sub, double addrspace(1)* %outgep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c.abs - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr 
float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %c.abs, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}neg_neg_mad_f32: -; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %nega = fsub float -0.000000e+00, %a - %negb = fsub float -0.000000e+00, %b - %mul = fmul float %nega, %negb - %sub = fadd float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_fabs_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %b.abs = call float @llvm.fabs.f32(float %b) #0 - %mul = fmul float %a, %b.abs - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %r2, %add - - store float %r3, float 
addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %add, %r2 - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/R600/mad_int24.ll b/llvm/test/CodeGen/R600/mad_int24.ll deleted file mode 100644 index 86d75a63ca4..00000000000 --- a/llvm/test/CodeGen/R600/mad_int24.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}i32_mad24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. -; EG: MULLO_INT -; Make sure we aren't masking the inputs. 
-; CM-NOT: AND -; CM: MULADD_INT24 -; SI-NOT: and -; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = ashr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = ashr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @test_imul24 -; SI: v_mad_i32_i24 -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - %add = add i32 %mul, %src2 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/mad_uint24.ll b/llvm/test/CodeGen/R600/mad_uint24.ll deleted file mode 100644 index 95fe3411959..00000000000 --- a/llvm/test/CodeGen/R600/mad_uint24.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}u32_mad24: -; EG: MULADD_UINT24 -; SI: v_mad_u32_u24 - -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i16_mad24: -; The order of A and B does not matter. -; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 16 -; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 - -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { -entry: - %0 = mul i16 %a, %b - %1 = add i16 %0, %c - %2 = sext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i8_mad24: -; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 8 -; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { -entry: - %0 = mul i8 %a, %b - %1 = add i8 %0, %c - %2 = sext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; This tests for a bug where the mad_u24 pattern matcher would call -; SimplifyDemandedBits on the first operand of the mul instruction -; assuming that the pattern would be matched to a 24-bit mad. This -; led to some instructions being incorrectly erased when the entire -; 24-bit mad pattern wasn't being matched. - -; Check that the select instruction is not deleted. 
-; FUNC-LABEL: {{^}}i24_i32_i32_mad: -; EG: CNDE_INT -; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { -entry: - %0 = ashr i32 %a, 8 - %1 = icmp ne i32 %c, 0 - %2 = select i1 %1, i32 %0, i32 34 - %3 = mul i32 %2, %c - %4 = add i32 %3, %d - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/madak.ll b/llvm/test/CodeGen/R600/madak.ll deleted file mode 100644 index 933bb016d2c..00000000000 --- a/llvm/test/CodeGen/R600/madak.ll +++ /dev/null @@ -1,193 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; FIXME: Enable VI - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madak_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure this is only folded with one use. This is a code size -; optimization and if we fold the immediate multiple times, we'll undo -; it. 
- -; GCN-LABEL: {{^}}madak_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] -; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, %b - %mul1 = fmul float %a, %c - %madak0 = fadd float %mul0, 10.0 - %madak1 = fadd float %mul1, 10.0 - - store float %madak0, float addrspace(1)* %out.gep.0, align 4 - store float %madak1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; GCN-LABEL: {{^}}madak_m_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float 4.0, %a - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure nothing weird happens with a value that is also allowed as -; an inline immediate. 
- -; GCN-LABEL: {{^}}madak_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 4.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; We can't use an SGPR when forming madak -; GCN-LABEL: {{^}}s_v_madak_f32: -; GCN: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: @v_s_madak_f32 -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madak_f32: -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 
- - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, %b.fabs - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/madmk.ll b/llvm/test/CodeGen/R600/madmk.ll deleted file mode 100644 index ba7bb221a99..00000000000 --- a/llvm/test/CodeGen/R600/madmk.ll +++ /dev/null @@ -1,205 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madmk_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] -; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = 
getelementptr float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, 10.0 - %mul1 = fmul float %a, 10.0 - %madmk0 = fadd float %mul0, %b - %madmk1 = fadd float %mul1, %c - - store float %madmk0, float addrspace(1)* %out.gep.0, align 4 - store float %madmk1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; We don't get any benefit if the constant is an inline immediate. -; GCN-LABEL: {{^}}madmk_inline_imm_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 4.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}v_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}scalar_vector_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %b = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b.fabs - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]] -; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, 2.0 - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: {{^}}kill_madmk_verifier_error: -; SI: s_xor_b64 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c -; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { -bb: - br label %bb2 - -bb1: ; preds = %bb2 - ret void - -bb2: ; preds = %bb6, %bb - %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] - %tmp3 = fsub float undef, %tmp - %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 - br i1 %tmp5, label %bb1, label %bb6 - -bb6: ; preds = %bb2 - %tmp4 = fmul float %tmp, undef - %tmp7 = fmul float %tmp4, 0x40E55DD180000000 - %tmp8 = fadd float %tmp7, undef - br label %bb2 -} diff --git a/llvm/test/CodeGen/R600/max-literals.ll b/llvm/test/CodeGen/R600/max-literals.ll deleted file mode 100644 index c357524b140..00000000000 --- a/llvm/test/CodeGen/R600/max-literals.ll +++ /dev/null @@ -1,67 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: ADD * - -define void 
@main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 5.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; CHECK-LABEL: {{^}}main2: -; CHECK-NOT: ADD * - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 2.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/max.ll b/llvm/test/CodeGen/R600/max.ll deleted file mode 100644 index fef3e2f0a21..00000000000 --- a/llvm/test/CodeGen/R600/max.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imax_sge_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imax_sge_i32 -; SI: s_max_i32 -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 
%val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sge i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sgt i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: @v_test_imax_sgt_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} -
-; FUNC-LABEL: @s_test_imax_sgt_i32 -; SI: s_max_i32 -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: @v_test_umax_uge_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} -
-; FUNC-LABEL: @s_test_umax_uge_i32 -; SI: s_max_u32 -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: @v_test_umax_ugt_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} -
-; FUNC-LABEL: @s_test_umax_ugt_i32 -; SI: s_max_u32 -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; Make sure the redundant and is removed.
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ugt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} -
-; Make sure the redundant sign_extend_inreg is removed. -
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp sgt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} -
-; FIXME: Should match min/max through the extends inserted by -; legalization. -
-; FUNC-LABEL: {{^}}s_test_imin_sge_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_ge_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sge i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/max3.ll b/llvm/test/CodeGen/R600/max3.ll deleted file mode 100644 index cfb94b272e5..00000000000 --- a/llvm/test/CodeGen/R600/max3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone -
-; FUNC-LABEL: @v_test_imax3_sgt_i32 -; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp sgt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp sgt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: @v_test_umax3_ugt_i32 -; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - 
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ugt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ugt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/merge-stores.ll b/llvm/test/CodeGen/R600/merge-stores.ll deleted file mode 100644 index dbf9d4481ff..00000000000 --- a/llvm/test/CodeGen/R600/merge-stores.ll +++ /dev/null @@ -1,536 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s - -; Run with devices with different unaligned load restrictions. - -; TODO: Vector element tests -; TODO: Non-zero base offset for load and store combinations -; TODO: Same base addrspacecasted - - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 0, i16 addrspace(1)* %out.gep.1 - store i16 0, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: -; GCN: buffer_store_dwordx2 -define void 
@merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* - store float 1.0, float addrspace(1)* %out.gep.1.bc - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* - store i32 123, i32 addrspace(1)* %out.gep.1.bc - store float 4.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: -; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 333, i32 addrspace(1)* %out.gep.3 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dwordx2 v -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 8.0, float addrspace(1)* %out - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - ret void -} - -; First store is out of order. Because of order of combines, the -; consecutive store fails because only some of the stores have been -; replaced with integer constant stores, and then won't merge because -; the types are different. 
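; A rough sketch of the intermediate IR this comment describes (the bit
; pattern below is an assumption for illustration, not something the test
; checks): once part of the chain has been combined, the stores can look like
;   store i32 1065353216, i32 addrspace(1)* %out.gep.1   ; bitcast of float 1.0
;   store float 2.0, float addrspace(1)* %out.gep.2
; and the consecutive-store merge gives up on the mixed i32/float sequence.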
- -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - store float 8.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword -; SI-NOT: buffer_store_dword -; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 - %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out.gep.2 - store i64 333, i64 addrspace(1)* %out.gep.3 - store i64 1234, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %lo = load i32, i32 addrspace(1)* %in.gep.0 - %hi = 
load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out.gep.0 - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %hi, i32 addrspace(1)* %out - store i32 %lo, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v -; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v -; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 - - %x = load float, float addrspace(1)* %in - %y = load float, float addrspace(1)* %in.gep.1 - %z = load float, float addrspace(1)* %in.gep.2 - %w = load float, float addrspace(1)* %in.gep.3 - - store float %x, float addrspace(1)* %out - store float %y, float 
addrspace(1)* %out.gep.1 - store float %z, float addrspace(1)* %out.gep.2 - store float %w, float addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 - - %x = load i32, i32 addrspace(1)* %in.gep.0 - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out.gep.0 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: s_barrier -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out.gep.3 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %x, i32 addrspace(1)* %out - - ret void -} - -; TODO: Re-packing of loaded register required. Maybe an IR pass -; should catch this? 
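; A sketch of the re-packed form the TODO asks for (hypothetical IR; %in.vec
; and %out.vec stand in for vector-typed views of the two pointers):
;   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in.vec
;   %rev = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
;   store <4 x i32> %rev, <4 x i32> addrspace(1)* %out.vec
; i.e. a single dwordx4 load/store pair plus a lane permute instead of four
; scalar accesses.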
- -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: s_barrier -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out - store i32 %z, i32 addrspace(1)* %out.gep.1 - store i32 %y, i32 addrspace(1)* %out.gep.2 - store i32 %x, i32 addrspace(1)* %out.gep.3 - - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: buffer_store_dword [[LOAD]] -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in, align 4 - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out, align 4 - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; This works once AA is enabled on the subtarget -; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: -; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] -; XGCN: buffer_store_dwordx4 [[LOAD]] -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in - - %x = extractelement <4 x i32> %vec, i32 0 - %y = extractelement <4 x i32> %vec, i32 1 - %z = extractelement <4 x i32> %vec, i32 2 - %w = extractelement <4 x i32> %vec, i32 3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: -; GCN: ds_write_b8 -; GCN: ds_write_b8 -; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 - - store i8 123, i8 addrspace(3)* %out.gep.1 - store i8 456, i8 addrspace(3)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out.gep.2 - store i32 333, i32 addrspace(3)* %out.gep.3 - store i32 1234, i32 addrspace(3)* %out - ret void -} - -declare void @llvm.AMDGPU.barrier.local() #1 - -attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/min.ll b/llvm/test/CodeGen/R600/min.ll deleted file mode 100644 index 0332d1a8e40..00000000000 --- a/llvm/test/CodeGen/R600/min.ll +++ /dev/null @@ -1,189 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin_sle_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - 
ret void -} - -; FUNC-LABEL: @s_test_imin_sle_i32 -; SI: s_min_i32 -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_imin_slt_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imin_slt_i32 -; SI: s_min_i32 -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp slt i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sle i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ule_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ule_i32 -; SI: s_min_u32 -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ult_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ult_i32 -; SI: s_min_u32 -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 
%a, i32 %b) nounwind { - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} -
-; FUNC-LABEL: @v_test_umin_ult_i32_multi_use -; SI-NOT: v_min -; SI: v_cmp_lt_u32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid - %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep0, align 4 - store i1 %cmp, i1 addrspace(1)* %outgep1 - ret void -} -
-; Make sure the redundant and is removed.
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ult i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} -
-; Make sure the redundant sign_extend_inreg is removed. -
-; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp slt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} -
-; FIXME: Should match min/max through the extends inserted by -; legalization. 
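; Sketched fold (an equivalence, not a claim about what the combiner does
; today): because sext is order-preserving, the legalized sequence
;   %a.ext = sext i16 %a to i32
;   %b.ext = sext i16 %b to i32
;   %cmp = icmp sle i32 %a.ext, %b.ext
;   %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
; computes sext(smin(%a, %b)), so a single s_min_i32 on the extended values
; would already be correct.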
- -; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_le_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sle i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/min3.ll b/llvm/test/CodeGen/R600/min3.ll deleted file mode 100644 index 38ef46d1bdd..00000000000 --- a/llvm/test/CodeGen/R600/min3.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin3_slt_i32 -; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp slt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_ult_i32 -; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ult i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ult i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_umin_umin -; SI: v_min_i32 -; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - 
- %icmp1 = icmp slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %i1 - %i2 = select i1 %icmp2, i32 %i0, i32 %i1 - - store i32 %i2, i32 addrspace(1)* %outgep1, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_2_uses -; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - - %icmp1 = icmp slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %c - %i2 = select i1 %icmp2, i32 %i0, i32 %c - - store i32 %i2, i32 addrspace(1)* %outgep0, align 4 - store i32 %i0, i32 addrspace(1)* %outgep1, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/missing-store.ll b/llvm/test/CodeGen/R600/missing-store.ll deleted file mode 100644 index 4af9cdf1b96..00000000000 --- a/llvm/test/CodeGen/R600/missing-store.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 - -; Make sure when the load from %ptr2 is folded the chain isn't lost, -; resulting in losing the store to gptr - -; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: ds_read_b64 -; SI: buffer_store_dword -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/R600/mubuf.ll b/llvm/test/CodeGen/R600/mubuf.ll deleted file mode 100644 index b19163f294e..00000000000 --- a/llvm/test/CodeGen/R600/mubuf.ll +++ /dev/null @@ -1,183 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -;;;==========================================================================;;; -;;; MUBUF LOAD TESTS -;;;==========================================================================;;; - -; MUBUF load with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_load0: -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, 
i32 addrspace(1)* %in, i64 1 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_load1: -; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %1 = load i8, i8 addrspace(1)* %0 - store i8 %1, i8 addrspace(1)* %out - ret void -} - -; MUBUF load with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_load2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_load3: -; CHECK-NOT: ADD -; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - %2 = load i32, i32 addrspace(1)* %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}soffset_max_imm: -; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -; Make sure immediates that aren't inline constants don't get folded into -; the soffset operand. -; FIXME: for this test we should be smart enough to shift the immediate into -; the offset field. 
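; One possible encoding the FIXME is pointing at (a sketch; the register
; operands here are placeholders): 65 fits in the 12-bit immediate offset
; field, so the load could in principle be emitted as
;   buffer_load_dword v0, v1, s[0:3], 0 offen offset:65 glc
; with soffset folded to the inline constant 0, avoiding the s_movk_i32.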
-; CHECK-LABEL: {{^}}soffset_no_fold: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 -; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -;;;==========================================================================;;; -;;; MUBUF STORE TESTS -;;;==========================================================================;;; - -; MUBUF store with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_store0: -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_store1: -; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 - -define void @mubuf_store1(i8 addrspace(1)* %out) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 - store i8 0, i8 addrspace(1)* %0 - ret void -} - -; MUBUF store with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_store2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define void @mubuf_store2(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_store3: -; CHECK-NOT: ADD -; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - store i32 0, i32 addrspace(1)* %1 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { - store i32 99, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst - ret void -} - -; CHECK-LABEL: {{^}}store_vgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/R600/mul.ll b/llvm/test/CodeGen/R600/mul.ll deleted file mode 100644 index 94e0f96b323..00000000000 --- a/llvm/test/CodeGen/R600/mul.ll +++ /dev/null @@ -1,200 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; mul24 and mad24 are affected - -; FUNC-LABEL: {{^}}test_mul_v2i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = mul <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul_v4i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = mul <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_mul_i32 -; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: v_mul_lo_i32 -; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top -; 32-bits of both arguments are sign bits. -; FUNC-LABEL: {{^}}mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = mul i64 %0, 80 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_i32 -; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 80 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: -; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 9 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_mul_i32: -; SI: s_load_dword [[SRC0:s[0-9]+]], -; SI: s_load_dword [[SRC1:s[0-9]+]], -; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %mul = mul i32 %a, %b - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i32: -; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = mul i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; A standard 64-bit multiply. 
The expansion should be around 6 instructions. -; It would be difficult to match the expansion correctly without writing -; a really complicated list of FileCheck expressions. I don't want -; to confuse people who may 'break' this test with a correct optimization, -; so this test just uses FUNC-LABEL to make sure the compiler does not -; crash with a 'failed to select' error. - -; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i64: -; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mul32_in_branch: -; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32, i32 addrspace(1)* %in - br label %endif - -else: - %2 = mul i32 %a, %b - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul64_in_branch: -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 -; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = mul i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/mul_int24.ll b/llvm/test/CodeGen/R600/mul_int24.ll deleted file mode 100644 index 7609dcc87af..00000000000 --- a/llvm/test/CodeGen/R600/mul_int24.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i32_mul24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
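The shl/ashr and shl/lshr pairs in these tests are the usual IR idiom for producing a sign- or zero-extended 24-bit value in place; the backend pattern-matches the idiom into MUL_INT24 / v_mul_i32_i24. A minimal sketch of the signed form, with %x as a stand-in value:

  %hi  = shl i32 %x, 8    ; push bit 23 up into the sign position
  %s24 = ashr i32 %hi, 8  ; arithmetic shift back: bits 24-31 now replicate bit 23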
-; EG: MULLO_INT
-; Make sure we are not masking the inputs
-; CM-NOT: AND
-; CM: MUL_INT24
-; SI-NOT: and
-; SI: v_mul_i32_i24
-define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
-entry:
- %0 = shl i32 %a, 8
- %a_24 = ashr i32 %0, 8
- %1 = shl i32 %b, 8
- %b_24 = ashr i32 %1, 8
- %2 = mul i32 %a_24, %b_24
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/mul_uint24.ll b/llvm/test/CodeGen/R600/mul_uint24.ll
deleted file mode 100644
index e640a7cd69f..00000000000
--- a/llvm/test/CodeGen/R600/mul_uint24.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-
-; FUNC-LABEL: {{^}}u32_mul24:
-; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI: v_mul_u32_u24
-
-define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
-entry:
- %0 = shl i32 %a, 8
- %a_24 = lshr i32 %0, 8
- %1 = shl i32 %b, 8
- %b_24 = lshr i32 %1, 8
- %2 = mul i32 %a_24, %b_24
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}i16_mul24:
-; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
-; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; EG: 16
-; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
-define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
-entry:
- %0 = mul i16 %a, %b
- %1 = sext i16 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}i8_mul24:
-; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
-; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-
-define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
-entry:
- %0 = mul i8 %a, %b
- %1 = sext i8 %0 to i32
- store i32 %1, i32 addrspace(1)* %out
- ret void
-}
-
-; Multiply with 24-bit inputs and 64-bit output
-; FUNC-LABEL: {{^}}mul24_i64:
-; EG: MUL_UINT24
-; EG: MULHI
-; SI: v_mul_u32_u24
-; FIXME: SI supports 24-bit mulhi; use it instead of v_mul_hi_u32
-; SI: v_mul_hi_u32
-define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
-entry:
- %0 = shl i64 %a, 40
- %a_24 = lshr i64 %0, 40
- %1 = shl i64 %b, 40
- %b_24 = lshr i64 %1, 40
- %2 = mul i64 %a_24, %b_24
- store i64 %2, i64 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/mulhu.ll b/llvm/test/CodeGen/R600/mulhu.ll
deleted file mode 100644
index 29b0944a553..00000000000
--- a/llvm/test/CodeGen/R600/mulhu.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
-;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-
-define void @test(i32 %p) {
- %i = udiv i32 %p, 3
- %r = bitcast i32 %i to float
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
- ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/llvm/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/R600/no-initializer-constant-addrspace.ll
deleted file mode 100644
index 9a814b579de..00000000000
--- a/llvm/test/CodeGen/R600/no-initializer-constant-addrspace.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s
-; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s
-; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s
-
-@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
-
-; FUNC-LABEL: {{^}}load_extern_const_init:
-define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
- %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
- store i32 %val, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
-
-; FUNC-LABEL: {{^}}load_undef_const_init:
-define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
- %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
- store i32 %val, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/no-shrink-extloads.ll b/llvm/test/CodeGen/R600/no-shrink-extloads.ll
deleted file mode 100644
index e4328ecbaca..00000000000
--- a/llvm/test/CodeGen/R600/no-shrink-extloads.ll
+++ /dev/null
@@ -1,191 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-; Make sure we don't turn the 32-bit argument load into a 16-bit
-; load. There are no extending scalar loads, so that would require
-; using a buffer_load instruction.
-
-; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
-; SI: s_load_dword s
-; SI: buffer_store_short v
-define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
- %trunc = trunc i32 %arg to i16
- store i16 %trunc, i16 addrspace(1)* %out
- ret void
-}
-
-; It should be OK (and probably performance neutral) to reduce this,
-; but we don't know if the load is uniform yet.
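For contrast, the shrinking these tests guard against would rewrite a full-width load feeding a trunc into a narrow load, roughly as sketched below (%in.i16 is a hypothetical re-typed pointer, not part of the test); on SI the narrow form has no extending scalar variant, so a possibly-uniform load would be forced onto the buffer_load path:

  ; before: full-width load feeding a trunc
  %load = load i32, i32 addrspace(1)* %gep.in
  %trunc = trunc i32 %load to i16
  ; after shrinking (the form being avoided while uniformity is unknown):
  %narrow = load i16, i16 addrspace(1)* %in.i16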
- -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: -; SI: buffer_load_dword v -; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i16 - store i16 %trunc, i16 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i1 - store i1 %trunc, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i1 - store i1 %trunc, i1 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; Might as well reduce to 8-bit loads. -; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { - %trunc = trunc i16 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: -; SI: buffer_load_ubyte v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i16, i16 addrspace(1)* %gep.in - %trunc = trunc i16 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} diff --git a/llvm/test/CodeGen/R600/operand-folding.ll b/llvm/test/CodeGen/R600/operand-folding.ll deleted file mode 100644 index 816755efb07..00000000000 --- a/llvm/test/CodeGen/R600/operand-folding.ll +++ /dev/null @@ -1,113 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}fold_sgpr: -; CHECK: v_add_i32_e32 v{{[0-9]+}}, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { -entry: - %tmp0 = icmp ne i32 %fold, 0 - br i1 %tmp0, label %if, label %endif - -if: - %id = call i32 @llvm.r600.read.tidig.x() - %offset = add i32 %fold, %id - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - store i32 0, i32 addrspace(1)* %tmp1 - br label %endif - -endif: - ret void -} - -; CHECK-LABEL: 
{{^}}fold_imm:
-; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
-define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
-entry:
- %fold = add i32 3, 2
- %tmp0 = icmp ne i32 %cmp, 0
- br i1 %tmp0, label %if, label %endif
-
-if:
- %id = call i32 @llvm.r600.read.tidig.x()
- %val = or i32 %id, %fold
- store i32 %val, i32 addrspace(1)* %out
- br label %endif
-
-endif:
- ret void
-}
-
-; CHECK-LABEL: {{^}}fold_64bit_constant_add:
-; CHECK-NOT: s_mov_b64
-; FIXME: It would be better if we could use v_add here and drop the extra
-; v_mov_b32 instructions.
-; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
-; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
-; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
-; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
-
-define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
-entry:
- %tmp0 = add i64 %val, 1
- store i64 %tmp0, i64 addrspace(1)* %out
- ret void
-}
-
-; Inline constants should always be folded.
-
-; CHECK-LABEL: {{^}}vector_inline:
-; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
-; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
-; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
-; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
-
-define void @vector_inline(<4 x i32> addrspace(1)* %out) {
-entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
- %tmp1 = add i32 %tmp0, 1
- %tmp2 = add i32 %tmp0, 2
- %tmp3 = add i32 %tmp0, 3
- %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
- %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
- %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
- %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
- %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3
- store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; Immediates with one use should be folded
-; CHECK-LABEL: {{^}}imm_one_use:
-; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
-
-define void @imm_one_use(i32 addrspace(1)* %out) {
-entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
- %tmp1 = xor i32 %tmp0, 100
- store i32 %tmp1, i32 addrspace(1)* %out
- ret void
-}
-; CHECK-LABEL: {{^}}vector_imm:
-; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
-
-define void @vector_imm(<4 x i32> addrspace(1)* %out) {
-entry:
- %tmp0 = call i32 @llvm.r600.read.tidig.x()
- %tmp1 = add i32 %tmp0, 1
- %tmp2 = add i32 %tmp0, 2
- %tmp3 = add i32 %tmp0, 3
- %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
- %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
- %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
- %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
- %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
- store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-declare i32 @llvm.r600.read.tidig.x() #0
-attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/R600/operand-spacing.ll b/llvm/test/CodeGen/R600/operand-spacing.ll
deleted file mode 100644
index 20420a84de6..00000000000
--- a/llvm/test/CodeGen/R600/operand-spacing.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
-
-; Make 
sure there isn't an extra space between the instruction name and first operands. - -; GCN-LABEL: {{^}}add_f32: -; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] -; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { - %result = fadd float %a, %b - store float %result, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/or.ll b/llvm/test/CodeGen/R600/or.ll deleted file mode 100644 index 1c04090b407..00000000000 --- a/llvm/test/CodeGen/R600/or.ll +++ /dev/null @@ -1,178 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}or_v2i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = or <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}or_v4i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
- %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
- %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
- %result = or <4 x i32> %a, %b
- store <4 x i32> %result, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}scalar_or_i32:
-; SI: s_or_b32
-define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
- %or = or i32 %a, %b
- store i32 %or, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_or_i32:
-; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
- %loada = load i32, i32 addrspace(1)* %a
- %or = or i32 %loada, %b
- store i32 %or, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}scalar_or_literal_i32:
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
-define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
- %or = or i32 %a, 99999
- store i32 %or, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_or_literal_i32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
- %loada = load i32, i32 addrspace(1)* %a, align 4
- %or = or i32 %loada, 65535
- store i32 %or, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
-define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
- %loada = load i32, i32 addrspace(1)* %a, align 4
- %or = or i32 %loada, 4
- store i32 %or, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; FUNC-LABEL: {{^}}scalar_or_i64:
-; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-
-; SI: s_or_b64
-define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
- %or = or i64 %a, %b
- store i64 %or, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_or_i64:
-; SI: v_or_b32_e32 v{{[0-9]}}
-; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
- %loada = load i64, i64 addrspace(1)* %a, align 8
- %loadb = load i64, i64 addrspace(1)* %b, align 8
- %or = or i64 %loada, %loadb
- store i64 %or, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}scalar_vector_or_i64:
-; SI: v_or_b32_e32 v{{[0-9]}}
-; SI: v_or_b32_e32 v{{[0-9]}}
-define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
- %loada = load i64, i64 addrspace(1)* %a
- %or = or i64 %loada, %b
- store i64 %or, i64 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_or_i64_loadimm:
-; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
-; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
-; SI: s_endpgm
-define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) 
{ - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 22470723082367 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FIXME: The or 0 should really be removed. -; FUNC-LABEL: {{^}}vector_or_i64_imm: -; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] -; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} -; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 8 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %add = or i64 %b, %a - %trunc = trunc i64 %add to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1)* %in0 - %b = load float, float addrspace(1)* %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 0.000000e+00 - %or = or i1 %acmp, %bcmp - %result = zext i1 %or to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_or_i1: -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { - %cmp0 = icmp eq i32 %a, %b - %cmp1 = icmp eq i32 %c, %d - %or = or i1 %cmp0, %cmp1 - store i1 %or, i1 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/packetizer.ll b/llvm/test/CodeGen/R600/packetizer.ll deleted file mode 100644 index 49a7c0df748..00000000000 --- a/llvm/test/CodeGen/R600/packetizer.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W - -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { -entry: - %shl = sub i32 32, %e - %x = add i32 %x_arg, 1 - %x.0 = shl i32 %x, %shl - %x.1 = lshr i32 %x, %e - %x.2 = or i32 %x.0, %x.1 - %y = add i32 %y_arg, 1 - %y.0 = shl i32 %y, %shl - %y.1 = lshr i32 %y, %e - %y.2 = or i32 %y.0, %y.1 - %z = add i32 %z_arg, 1 - %z.0 = shl i32 %z, %shl - %z.1 = lshr i32 %z, %e - %z.2 = or i32 %z.0, %z.1 - %w = add i32 %w_arg, 1 - %w.0 = shl i32 %w, %shl - %w.1 = lshr i32 %w, %e - %w.2 = or i32 %w.0, %w.1 - %xy = or i32 %x.2, %y.2 - %zw = or i32 %z.2, %w.2 - %xyzw = or i32 %xy, %zw - store i32 %xyzw, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/parallelandifcollapse.ll b/llvm/test/CodeGen/R600/parallelandifcollapse.ll deleted file mode 100644 index f32b044198a..00000000000 --- a/llvm/test/CodeGen/R600/parallelandifcollapse.ll +++ /dev/null @@ -1,59 +0,0 @@ -; Function Attrs: nounwind -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s -; -; CFG flattening should use 
parallel-and mode to generate branch conditions and
-; then merge if-regions with the same bodies.
-;
-; CHECK: AND_INT
-; CHECK-NEXT: AND_INT
-; CHECK-NEXT: OR_INT
-
-; FIXME: For some reason having the allocas here allowed the flatten cfg pass
-; to do its transformation; however, now that we are using local memory for
-; allocas, the transformation isn't happening.
-
-define void @_Z9chk1D_512v() #0 {
-entry:
- %a0 = alloca i32, align 4
- %b0 = alloca i32, align 4
- %c0 = alloca i32, align 4
- %d0 = alloca i32, align 4
- %a1 = alloca i32, align 4
- %b1 = alloca i32, align 4
- %c1 = alloca i32, align 4
- %d1 = alloca i32, align 4
- %data = alloca i32, align 4
- %0 = load i32, i32* %a0, align 4
- %1 = load i32, i32* %b0, align 4
- %cmp = icmp ne i32 %0, %1
- br i1 %cmp, label %land.lhs.true, label %if.end
-
-land.lhs.true: ; preds = %entry
- %2 = load i32, i32* %c0, align 4
- %3 = load i32, i32* %d0, align 4
- %cmp1 = icmp ne i32 %2, %3
- br i1 %cmp1, label %if.then, label %if.end
-
-if.then: ; preds = %land.lhs.true
- store i32 1, i32* %data, align 4
- br label %if.end
-
-if.end: ; preds = %if.then, %land.lhs.true, %entry
- %4 = load i32, i32* %a1, align 4
- %5 = load i32, i32* %b1, align 4
- %cmp2 = icmp ne i32 %4, %5
- br i1 %cmp2, label %land.lhs.true3, label %if.end6
-
-land.lhs.true3: ; preds = %if.end
- %6 = load i32, i32* %c1, align 4
- %7 = load i32, i32* %d1, align 4
- %cmp4 = icmp ne i32 %6, %7
- br i1 %cmp4, label %if.then5, label %if.end6
-
-if.then5: ; preds = %land.lhs.true3
- store i32 1, i32* %data, align 4
- br label %if.end6
-
-if.end6: ; preds = %if.then5, %land.lhs.true3, %if.end
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/parallelorifcollapse.ll b/llvm/test/CodeGen/R600/parallelorifcollapse.ll
deleted file mode 100644
index 1da1e91b8ab..00000000000
--- a/llvm/test/CodeGen/R600/parallelorifcollapse.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; Function Attrs: nounwind
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;
-; CFG flattening should use parallel-or to generate branch conditions and
-; then merge if-regions with the same bodies.
-
-; FIXME: For some reason having the allocas here allowed the flatten cfg pass
-; to do its transformation; however, now that we are using local memory for
-; allocas, the transformation isn't happening.
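As a reduced sketch of the intended collapse (names hypothetical): the two independent comparisons are combined into one predicate, so the duplicated bodies can merge under a single branch, which is what the OR_INT checks below look for:

  %c0 = icmp ne i32 %v0, %v1
  %c1 = icmp ne i32 %v2, %v3
  %cond = or i1 %c0, %c1        ; becomes OR_INT in the R600 output
  br i1 %cond, label %merged, label %exit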
-; XFAIL: * -; -; CHECK: OR_INT -; CHECK-NEXT: OR_INT -; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { -entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 - %cmp = icmp ne i32 %0, %1 - br i1 %cmp, label %land.lhs.true, label %if.else - -land.lhs.true: ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 - %cmp1 = icmp ne i32 %2, %3 - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %land.lhs.true - br label %if.end - -if.else: ; preds = %land.lhs.true, %entry - store i32 1, i32* %data, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 - %cmp2 = icmp ne i32 %4, %5 - br i1 %cmp2, label %land.lhs.true3, label %if.else6 - -land.lhs.true3: ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 - %cmp4 = icmp ne i32 %6, %7 - br i1 %cmp4, label %if.then5, label %if.else6 - -if.then5: ; preds = %land.lhs.true3 - br label %if.end7 - -if.else6: ; preds = %land.lhs.true3, %if.end - store i32 1, i32* %data, align 4 - br label %if.end7 - -if.end7: ; preds = %if.else6, %if.then5 - ret void -} - diff --git a/llvm/test/CodeGen/R600/predicate-dp4.ll b/llvm/test/CodeGen/R600/predicate-dp4.ll deleted file mode 100644 index 6bc18759435..00000000000 --- a/llvm/test/CodeGen/R600/predicate-dp4.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -; CHECK-LABEL: {{^}}main: -; CHECK: PRED_SETE_INT * Pred, -; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one -define void @main(<4 x float> inreg) #0 { -main_body: - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - br i1 %3, label %IF, label %ENDIF - -IF: ; preds = %main_body - %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) - br label %ENDIF - -ENDIF: ; preds = %IF, %main_body - %5 = phi float [%4, %IF], [0.000000e+00, %main_body] - %6 = insertelement <4 x float> undef, float %5, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/predicates.ll b/llvm/test/CodeGen/R600/predicates.ll deleted file mode 100644 index 0ce74d97ba8..00000000000 --- a/llvm/test/CodeGen/R600/predicates.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s - -; These tests make sure the compiler is optimizing branches using predicates -; when it is legal to do so. 
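When the guarded region is a single cheap operation, executing it under a predicate is equivalent to a select, which is what makes the optimization legal; a minimal IR sketch of the pattern in simple_if below:

  %cc  = icmp sgt i32 %in, 0
  %shl = shl i32 %in, 1
  ; predicated execution of the shl behaves like:
  %res = select i1 %cc, i32 %shl, i32 %in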
-
-; CHECK: {{^}}simple_if:
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF, label %ENDIF
-
-IF:
- %1 = shl i32 %in, 1
- br label %ENDIF
-
-ENDIF:
- %2 = phi i32 [ %in, %entry ], [ %1, %IF ]
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; CHECK: {{^}}simple_if_else:
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
-; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF, label %ELSE
-
-IF:
- %1 = shl i32 %in, 1
- br label %ENDIF
-
-ELSE:
- %2 = lshr i32 %in, 1
- br label %ENDIF
-
-ENDIF:
- %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ]
- store i32 %3, i32 addrspace(1)* %out
- ret void
-}
-
-; CHECK: {{^}}nested_if:
-; CHECK: ALU_PUSH_BEFORE
-; CHECK: JUMP
-; CHECK: POP
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF0, label %ENDIF
-
-IF0:
- %1 = add i32 %in, 10
- %2 = icmp sgt i32 %1, 0
- br i1 %2, label %IF1, label %ENDIF
-
-IF1:
- %3 = shl i32 %1, 1
- br label %ENDIF
-
-ENDIF:
- %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1]
- store i32 %4, i32 addrspace(1)* %out
- ret void
-}
-
-; CHECK: {{^}}nested_if_else:
-; CHECK: ALU_PUSH_BEFORE
-; CHECK: JUMP
-; CHECK: POP
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
-; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
-; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
-entry:
- %0 = icmp sgt i32 %in, 0
- br i1 %0, label %IF0, label %ENDIF
-
-IF0:
- %1 = add i32 %in, 10
- %2 = icmp sgt i32 %1, 0
- br i1 %2, label %IF1, label %ELSE1
-
-IF1:
- %3 = shl i32 %1, 1
- br label %ENDIF
-
-ELSE1:
- %4 = lshr i32 %in, 1
- br label %ENDIF
-
-ENDIF:
- %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1]
- store i32 %5, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/private-memory-atomics.ll b/llvm/test/CodeGen/R600/private-memory-atomics.ll
deleted file mode 100644
index a008ac98a43..00000000000
--- a/llvm/test/CodeGen/R600/private-memory-atomics.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s
-
-; This works because the promote-alloca pass replaces these with LDS atomics.
-
-; Private atomics have no real use, but the compiler at least shouldn't crash on them.
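A rough sketch of what a successful promotion produces for the atomicrmw below, assuming the pass picks a workgroup-local LDS buffer (@lds and %idx are illustrative, not what the pass literally emits):

  @lds = internal addrspace(3) global [2 x i32] undef, align 4

  %ptr = getelementptr [2 x i32], [2 x i32] addrspace(3)* @lds, i32 0, i32 %idx
  %old = atomicrmw add i32 addrspace(3)* %ptr, i32 7 acq_rel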
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic - %val = extractvalue { i32, i1 } %tmp4, 0 - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/private-memory-broken.ll b/llvm/test/CodeGen/R600/private-memory-broken.ll deleted file mode 100644 index 6b18a19f195..00000000000 --- a/llvm/test/CodeGen/R600/private-memory-broken.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s - -; Make sure promote alloca pass doesn't crash - -; CHECK: unsupported call - -declare i32 @foo(i32*) nounwind - -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %val = call i32 @foo(i32* %tmp3) nounwind - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/private-memory.ll b/llvm/test/CodeGen/R600/private-memory.ll deleted file mode 100644 index 1c562978050..00000000000 --- a/llvm/test/CodeGen/R600/private-memory.ll +++ /dev/null @@ -1,313 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}mova_same_clause: - -; R600: LDS_WRITE -; R600: LDS_WRITE -; R600: LDS_READ -; R600: LDS_READ - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 - -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 
s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { -entry: - %stack = alloca [5 x i32], align 4 - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 - ret void -} - -; This test checks that the stack offset is calculated correctly for structs. -; All register loads/stores should be optimized away, so there shouldn't be -; any MOVA instructions. -; -; XXX: This generated code has unnecessary MOVs, we should be able to optimize -; this. - -; FUNC-LABEL: {{^}}multiple_structs: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel -; SI-NOT: v_movrel -%struct.point = type { i32, i32 } - -define void @multiple_structs(i32 addrspace(1)* %out) { -entry: - %a = alloca %struct.point - %b = alloca %struct.point - %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 - %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 - store i32 0, i32* %a.x.ptr - store i32 1, i32* %a.y.ptr - store i32 2, i32* %b.x.ptr - store i32 3, i32* %b.y.ptr - %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %a.indirect = load i32, i32* %a.indirect.ptr - %b.indirect = load i32, i32* %b.indirect.ptr - %0 = add i32 %a.indirect, %b.indirect - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test direct access of a private array inside a loop. The private array -; loads and stores should be lowered to copies, so there shouldn't be any -; MOVA instructions. 
- -; FUNC-LABEL: {{^}}direct_loop: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel - -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %prv_array_const = alloca [2 x i32] - %prv_array = alloca [2 x i32] - %a = load i32, i32 addrspace(1)* %in - %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - store i32 %a, i32* %a_dst_ptr - %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 - store i32 %b, i32* %b_dst_ptr - br label %for.body - -for.body: - %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - %x = load i32, i32* %x_ptr - %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %y = load i32, i32* %y_ptr - %xy = add i32 %x, %y - store i32 %xy, i32* %y_ptr - %count = add i32 %inc, 1 - %done = icmp eq i32 %count, 4095 - br i1 %done, label %for.end, label %for.body - -for.end: - %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %value = load i32, i32* %value_ptr - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}short_array: - -; R600: MOVA_INT - -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 -; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} -define void @short_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i16] - %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 - store i16 0, i16* %1 - store i16 1, i16* %2 - %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index - %4 = load i16, i16* %3 - %5 = sext i16 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}char_array: - -; R600: MOVA_INT - -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i8] - %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 - store i8 0, i8* %1 - store i8 1, i8* %2 - %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index - %4 = load i8, i8* %3 - %5 = sext i8 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void - -} - -; Make sure we don't overwrite workitem information with private memory - -; FUNC-LABEL: {{^}}work_item_info: -; R600-NOT: MOV T0.X -; Additional check in case the move ends up in the last slot -; R600-NOT: MOV * TO.X - -; SI-NOT: v_mov_b32_e{{(32|64)}} v0 -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [2 x i32] - %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 - %5 = call i32 @llvm.r600.read.tidig.x() - %6 = add i32 %4, %5 - store i32 %6, i32 addrspace(1)* 
%out - ret void -} - -; Test that two stack objects are not stored in the same register -; The second stack object should be in T3.X -; FUNC-LABEL: {{^}}no_overlap: -; R600_CHECK: MOV -; R600_CHECK: [[CHAN:[XYZW]]]+ -; R600-NOT: [[CHAN]]+ -; SI: v_mov_b32_e32 v3 -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [3 x i8], align 1 - %1 = alloca [2 x i8], align 1 - %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 - %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 - %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 - %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 - %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 - store i8 0, i8* %2 - store i8 1, i8* %3 - store i8 2, i8* %4 - store i8 1, i8* %5 - store i8 0, i8* %6 - %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in - %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in - %9 = load i8, i8* %7 - %10 = load i8, i8* %8 - %11 = add i8 %9, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - ret void -} - -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i8]] - %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8* %gep0 - store i8 1, i8* %gep1 - %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8* %gep2 - %sext = sext i8 %load to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i64]] - %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 1, i64* %gep1 - %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64* %gep2 - store i64 %load, i64 addrspace(1)* %out - ret void -} - -%struct.pair32 = type { i32, i32 } - -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x %struct.pair32]] - %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x %struct.pair32] - %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x 
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 - %load = load i32, i32* %sel - store i32 %load, i32 addrspace(1)* %out - ret void -} - -; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it -; finds one, it should stop trying to promote. - -; FUNC-LABEL: ptrtoint: -; SI-NOT: ds_write -; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %alloca = alloca [16 x i32] - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - store i32 5, i32* %tmp0 - %tmp1 = ptrtoint [16 x i32]* %alloca to i32 - %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32* - %tmp4 = getelementptr i32, i32* %tmp3, i32 %b - %tmp5 = load i32, i32* %tmp4 - store i32 %tmp5, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/pv-packing.ll b/llvm/test/CodeGen/R600/pv-packing.ll deleted file mode 100644 index abeae563ff3..00000000000 --- a/llvm/test/CodeGen/R600/pv-packing.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -;CHECK: DOT4 T{{[0-9]\.X}} -;CHECK: MULADD_IEEE * T{{[0-9]\.W}} - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg2, i32 0 - %4 = extractelement <4 x float> %reg2, i32 1 - %5 = extractelement <4 x float> %reg2, i32 2 - %6 = extractelement <4 x float> %reg3, i32 0 - %7 = extractelement <4 x float> %reg3, i32 1 - %8 = extractelement <4 x float> %reg3, i32 2 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) - %12 = fmul float %0, %3 - %13 = fadd float %12, %6 - %14 = fmul float %1, %4 - %15 = fadd float %14, %7 - %16 = fmul float %2, %5 - %17 = fadd float %16, %8 - %18 = fmul float %11, %11 - %19 = fadd float %18, %0 - %20 = insertelement <4 x float> undef, float %13, i32 0 - %21 = insertelement <4 x float> %20, float %15, i32 1 - %22 = insertelement <4 x float> %21, float %17, i32 2 - %23 = insertelement <4 x float> %22, float %19, i32 3 - %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) - %25 = insertelement <4 x float> undef, float %24, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = 
{ "ShaderType"="1" } -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/pv.ll b/llvm/test/CodeGen/R600/pv.ll deleted file mode 100644 index 9a57dd19765..00000000000 --- a/llvm/test/CodeGen/R600/pv.ll +++ /dev/null @@ -1,241 +0,0 @@ -; RUN: llc < %s -march=r600 | FileCheck %s - -; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) -; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = extractelement <4 x float> %reg4, i32 0 - %13 = extractelement <4 x float> %reg4, i32 1 - %14 = extractelement <4 x float> %reg4, i32 2 - %15 = extractelement <4 x float> %reg4, i32 3 - %16 = extractelement <4 x float> %reg5, i32 0 - %17 = extractelement <4 x float> %reg5, i32 1 - %18 = extractelement <4 x float> %reg5, i32 2 - %19 = extractelement <4 x float> %reg5, i32 3 - %20 = extractelement <4 x float> %reg6, i32 0 - %21 = extractelement <4 x float> %reg6, i32 1 - %22 = extractelement <4 x float> %reg6, i32 2 - %23 = extractelement <4 x float> %reg6, i32 3 - %24 = extractelement <4 x float> %reg7, i32 0 - %25 = extractelement <4 x float> %reg7, i32 1 - %26 = extractelement <4 x float> %reg7, i32 2 - %27 = extractelement <4 x float> %reg7, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* null - %29 = extractelement <4 x float> %28, i32 0 - %30 = fmul float %0, %29 - %31 = load <4 x float>, <4 x float> addrspace(8)* null - %32 = extractelement <4 x float> %31, i32 1 - %33 = fmul float %0, %32 - %34 = load <4 x float>, <4 x float> addrspace(8)* null - %35 = extractelement <4 x float> %34, i32 2 - %36 = fmul float %0, %35 - %37 = load <4 x float>, <4 x float> addrspace(8)* null - %38 = extractelement <4 x float> %37, i32 3 - %39 = fmul float %0, %38 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %1, %41 - %43 = fadd float %42, %30 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %1, %45 - %47 = fadd float %46, %33 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %1, %49 - %51 = fadd float %50, %36 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %53 = extractelement <4 x float> %52, i32 3 - %54 = fmul float %1, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 2) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %2, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %2, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %2, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %2, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %73 = extractelement <4 x float> %72, i32 0 - %74 = fmul float %3, %73 - %75 = fadd float %74, %59 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 1 - %78 = fmul float %3, %77 - %79 = fadd float %78, %63 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = fmul float %3, %81 - %83 = fadd float %82, %67 - %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %85 = extractelement <4 x float> %84, i32 3 - %86 = fmul float %3, %85 - %87 = fadd float %86, %71 - %88 = insertelement <4 x float> undef, float %4, i32 0 - %89 = insertelement <4 x float> %88, float %5, i32 1 - %90 = insertelement <4 x float> %89, float %6, i32 2 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 - %92 = insertelement <4 x float> undef, float %4, i32 0 - %93 = insertelement <4 x float> %92, float %5, i32 1 - %94 = insertelement <4 x float> %93, float %6, i32 2 - %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 - %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) - %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq.f32(float %97) - %99 = fmul float %4, %98 - %100 = fmul float %5, %98 - %101 = fmul float %6, %98 - %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %103 = extractelement <4 x float> %102, i32 0 - %104 = fmul float %103, %8 - %105 = fadd float %104, %20 - %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %107 = extractelement <4 x float> %106, i32 1 - %108 = fmul float %107, %9 - %109 = fadd float %108, %21 - %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %111 = extractelement <4 x float> %110, i32 2 - %112 = fmul float %111, %10 - %113 = fadd float %112, %22 - %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) - %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) - %116 = call float @llvm.AMDIL.clamp.(float %113, 
float 0.000000e+00, float 1.000000e+00) - %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %119 = extractelement <4 x float> %118, i32 0 - %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %121 = extractelement <4 x float> %120, i32 1 - %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %123 = extractelement <4 x float> %122, i32 2 - %124 = insertelement <4 x float> undef, float %99, i32 0 - %125 = insertelement <4 x float> %124, float %100, i32 1 - %126 = insertelement <4 x float> %125, float %101, i32 2 - %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 - %128 = insertelement <4 x float> undef, float %119, i32 0 - %129 = insertelement <4 x float> %128, float %121, i32 1 - %130 = insertelement <4 x float> %129, float %123, i32 2 - %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 - %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) - %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %134 = extractelement <4 x float> %133, i32 0 - %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %136 = extractelement <4 x float> %135, i32 1 - %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %138 = extractelement <4 x float> %137, i32 2 - %139 = insertelement <4 x float> undef, float %99, i32 0 - %140 = insertelement <4 x float> %139, float %100, i32 1 - %141 = insertelement <4 x float> %140, float %101, i32 2 - %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 - %143 = insertelement <4 x float> undef, float %134, i32 0 - %144 = insertelement <4 x float> %143, float %136, i32 1 - %145 = insertelement <4 x float> %144, float %138, i32 2 - %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 - %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) - %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %149 = extractelement <4 x float> %148, i32 0 - %150 = fmul float %149, %8 - %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %152 = extractelement <4 x float> %151, i32 1 - %153 = fmul float %152, %9 - %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %155 = extractelement <4 x float> %154, i32 2 - %156 = fmul float %155, %10 - %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %158 = extractelement <4 x float> %157, i32 0 - %159 = fmul float %158, %12 - %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %161 = extractelement <4 x float> %160, i32 1 - %162 = fmul float %161, %13 - %163 = load <4 x float>, <4 
x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %164 = extractelement <4 x float> %163, i32 2 - %165 = fmul float %164, %14 - %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %167 = extractelement <4 x float> %166, i32 0 - %168 = fmul float %167, %16 - %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %170 = extractelement <4 x float> %169, i32 1 - %171 = fmul float %170, %17 - %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %173 = extractelement <4 x float> %172, i32 2 - %174 = fmul float %173, %18 - %175 = fcmp uge float %132, 0.000000e+00 - %176 = select i1 %175, float %132, float 0.000000e+00 - %177 = fcmp uge float %147, 0.000000e+00 - %178 = select i1 %177, float %147, float 0.000000e+00 - %179 = call float @llvm.pow.f32(float %178, float %24) - %180 = fcmp ult float %132, 0.000000e+00 - %181 = select i1 %180, float 0.000000e+00, float %179 - %182 = fadd float %150, %105 - %183 = fadd float %153, %109 - %184 = fadd float %156, %113 - %185 = fmul float %176, %159 - %186 = fadd float %185, %182 - %187 = fmul float %176, %162 - %188 = fadd float %187, %183 - %189 = fmul float %176, %165 - %190 = fadd float %189, %184 - %191 = fmul float %181, %168 - %192 = fadd float %191, %186 - %193 = fmul float %181, %171 - %194 = fadd float %193, %188 - %195 = fmul float %181, %174 - %196 = fadd float %195, %190 - %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) - %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) - %200 = insertelement <4 x float> undef, float %75, i32 0 - %201 = insertelement <4 x float> %200, float %79, i32 1 - %202 = insertelement <4 x float> %201, float %83, i32 2 - %203 = insertelement <4 x float> %202, float %87, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) - %204 = insertelement <4 x float> undef, float %197, i32 0 - %205 = insertelement <4 x float> %204, float %198, i32 1 - %206 = insertelement <4 x float> %205, float %199, i32 2 - %207 = insertelement <4 x float> %206, float %117, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/llvm/test/CodeGen/R600/r600-encoding.ll b/llvm/test/CodeGen/R600/r600-encoding.ll deleted file mode 100644 index 3a82ee30a32..00000000000 --- a/llvm/test/CodeGen/R600/r600-encoding.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -show-mc-encoding 
-mcpu=rs880 | FileCheck --check-prefix=R600 %s - -; The earliest R600 GPUs have a slightly different encoding than the rest of -; the VLIW4/5 GPUs. - -; EG: {{^}}test: -; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] - -; R600: {{^}}test: -; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] - -define void @test(<4 x float> inreg %reg0) #0 { -entry: - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fmul float %r0, %r1 - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/r600-export-fix.ll b/llvm/test/CodeGen/R600/r600-export-fix.ll deleted file mode 100644 index 7cb80195b36..00000000000 --- a/llvm/test/CodeGen/R600/r600-export-fix.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s - -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0XYZ -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.YZ00 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 - - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fmul float %5, %0 - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %8 = extractelement <4 x float> %7, i32 1 - %9 = fmul float %8, %0 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %11 = extractelement <4 x float> %10, i32 2 - %12 = fmul float %11, %0 - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %14 = extractelement <4 x float> %13, i32 3 - %15 = fmul float %14, %0 - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %17 = extractelement <4 x float> %16, i32 0 - %18 = fmul float %17, %1 - %19 = fadd float %18, %6 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %1 - %23 = fadd float %22, %9 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %25 = extractelement <4 x float> %24, i32 2 - %26 = fmul float %25, %1 - %27 = fadd float %26, %12 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fmul float %29, %1 - %31 = fadd 
float %30, %15 - %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float %33, %2 - %35 = fadd float %34, %19 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %37 = extractelement <4 x float> %36, i32 1 - %38 = fmul float %37, %2 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %41 = extractelement <4 x float> %40, i32 2 - %42 = fmul float %41, %2 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %45 = extractelement <4 x float> %44, i32 3 - %46 = fmul float %45, %2 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %49 = extractelement <4 x float> %48, i32 0 - %50 = fmul float %49, %3 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %53 = extractelement <4 x float> %52, i32 1 - %54 = fmul float %53, %3 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %57 = extractelement <4 x float> %56, i32 2 - %58 = fmul float %57, %3 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %61 = extractelement <4 x float> %60, i32 3 - %62 = fmul float %61, %3 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* null - %65 = extractelement <4 x float> %64, i32 0 - %66 = load <4 x float>, <4 x float> addrspace(8)* null - %67 = extractelement <4 x float> %66, i32 1 - %68 = load <4 x float>, <4 x float> addrspace(8)* null - %69 = extractelement <4 x float> %68, i32 2 - %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %71 = extractelement <4 x float> %70, i32 0 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %73 = extractelement <4 x float> %72, i32 1 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %75 = extractelement <4 x float> %74, i32 2 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 0 - %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %79 = extractelement <4 x float> %78, i32 1 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = insertelement <4 x float> undef, float %51, i32 0 - %83 = insertelement <4 x float> %82, float %55, i32 1 - %84 = 
insertelement <4 x float> %83, float %59, i32 2 - %85 = insertelement <4 x float> %84, float %63, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) - %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 - %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) - %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 - %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 - %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) - %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %95 = insertelement <4 x float> %94, float %65, i32 1 - %96 = insertelement <4 x float> %95, float %67, i32 2 - %97 = insertelement <4 x float> %96, float %69, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) - %98 = insertelement <4 x float> undef, float %77, i32 0 - %99 = insertelement <4 x float> %98, float %79, i32 1 - %100 = insertelement <4 x float> %99, float %81, i32 2 - %101 = insertelement <4 x float> %100, float %71, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) - %102 = insertelement <4 x float> undef, float %73, i32 0 - %103 = insertelement <4 x float> %102, float %75, i32 1 - %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 - %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) - %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 - %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 - %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) - %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 - %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 - %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/llvm/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll deleted file mode 100644 index f388f8ffe29..00000000000 --- a/llvm/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll +++ /dev/null @@ -1,58 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg, <4 x float> inreg) #0 { -main_body: - %2 = extractelement <4 x float> %0, i32 0 - %3 = extractelement <4 x float> %0, i32 1 - %4 = extractelement <4 x float> %0, i32 2 - %5 = extractelement <4 x float> %0, i32 3 - %6 = insertelement <4 x float> undef, float %2, i32 0 - %7 = insertelement <4 x float> %6, float %3, i32 1 - %8 = insertelement <4 x float> %7, float %4, i32 2 - %9 = insertelement <4 x float> %8, float %5, i32 3 - %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) - %11 = extractelement <4 x float> %10, i32 0 - %12 = extractelement <4 x float> %10, i32 1 - %13 = 
extractelement <4 x float> %10, i32 2 - %14 = extractelement <4 x float> %10, i32 3 - %15 = call float @fabs(float %13) - %16 = fdiv float 1.000000e+00, %15 - %17 = fmul float %11, %16 - %18 = fadd float %17, 1.500000e+00 - %19 = fmul float %12, %16 - %20 = fadd float %19, 1.500000e+00 - %21 = insertelement <4 x float> undef, float %20, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %14, i32 2 - %24 = insertelement <4 x float> %23, float %5, i32 3 - %25 = extractelement <4 x float> %24, i32 0 - %26 = extractelement <4 x float> %24, i32 1 - %27 = extractelement <4 x float> %24, i32 2 - %28 = extractelement <4 x float> %24, i32 3 - %29 = insertelement <4 x float> undef, float %25, i32 0 - %30 = insertelement <4 x float> %29, float %26, i32 1 - %31 = insertelement <4 x float> %30, float %27, i32 2 - %32 = insertelement <4 x float> %31, float %28, i32 3 - %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) - %34 = extractelement <4 x float> %33, i32 0 - %35 = insertelement <4 x float> undef, float %34, i32 0 - %36 = insertelement <4 x float> %35, float %34, i32 1 - %37 = insertelement <4 x float> %36, float %34, i32 2 - %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/r600cfg.ll b/llvm/test/CodeGen/R600/r600cfg.ll deleted file mode 100644 index c7b9d65220f..00000000000 --- a/llvm/test/CodeGen/R600/r600cfg.ll +++ /dev/null @@ -1,119 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = bitcast float %0 to i32 - %5 = icmp eq i32 %4, 0 - %6 = sext i1 %5 to i32 - %7 = bitcast i32 %6 to float - %8 = bitcast float %7 to i32 - %9 = icmp ne i32 %8, 0 - %. 
= select i1 %9, float 0x36A0000000000000, float %0 - br label %LOOP - -LOOP: ; preds = %LOOP47, %main_body - %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] - %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] - %10 = bitcast float %temp4.1 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF41, label %ENDIF40 - -IF41: ; preds = %LOOP - %16 = insertelement <4 x float> undef, float %0, i32 0 - %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 - %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) - %20 = insertelement <4 x float> undef, float %0, i32 0 - %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 - %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 - %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) - %24 = insertelement <4 x float> undef, float %0, i32 0 - %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 - %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 - %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) - %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 - %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 - %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) - %32 = insertelement <4 x float> undef, float %0, i32 0 - %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 - %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 - %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) - ret void - -ENDIF40: ; preds = %LOOP - %36 = bitcast float %temp8.0 to i32 - %37 = add i32 %36, 1 - %38 = bitcast i32 %37 to float - %39 = bitcast float %temp4.1 to i32 - %40 = urem i32 %39, 2 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp eq i32 %42, 0 - %44 = sext i1 %43 to i32 - %45 = bitcast i32 %44 to float - %46 = bitcast float %45 to i32 - %47 = icmp ne i32 %46, 0 - %48 = bitcast float %temp4.1 to i32 - br i1 %47, label %IF44, label %ELSE45 - -IF44: ; preds = %ENDIF40 - %49 = udiv i32 %48, 2 - br label %ENDIF43 - -ELSE45: ; preds = %ENDIF40 - %50 = mul i32 3, %48 - %51 = add i32 %50, 1 - br label %ENDIF43 - -ENDIF43: ; preds = %ELSE45, %IF44 - %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] - %52 = bitcast i32 %.sink to float - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = bitcast float %54 to i32 - br label %LOOP47 - -LOOP47: ; preds = %ENDIF48, %ENDIF43 - %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] - %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] - %56 = bitcast float %temp28.0 to i32 - %57 = icmp uge i32 %56, %55 - %58 = sext i1 %57 to i32 - %59 = bitcast i32 %58 to float - %60 = bitcast float %59 to i32 - %61 = icmp ne i32 %60, 0 - br i1 %61, label %LOOP, label %ENDIF48 - -ENDIF48: ; preds 
= %LOOP47 - %62 = bitcast float %temp12.1 to i32 - %63 = mul i32 %62, 2 - %64 = bitcast i32 %63 to float - %65 = bitcast float %64 to i32 - %66 = urem i32 %65, 2147483647 - %67 = bitcast i32 %66 to float - %68 = bitcast float %temp28.0 to i32 - %69 = add i32 %68, 1 - %70 = bitcast i32 %69 to float - br label %LOOP47 -} - -declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/reciprocal.ll b/llvm/test/CodeGen/R600/reciprocal.ll deleted file mode 100644 index b4ac47afced..00000000000 --- a/llvm/test/CodeGen/R600/reciprocal.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/llvm/test/CodeGen/R600/register-count-comments.ll b/llvm/test/CodeGen/R600/register-count-comments.ll deleted file mode 100644 index de6bfb31088..00000000000 --- a/llvm/test/CodeGen/R600/register-count-comments.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.SI.tid() nounwind readnone - -; SI-LABEL: {{^}}foo: -; SI: .section .AMDGPU.csdata -; SI: ; Kernel info: -; SI: ; NumSgprs: {{[0-9]+}} -; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { - %tid = call i32 @llvm.SI.tid() nounwind readnone - %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid - %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid - %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %outptr, align 4 - ret void -} - -; SI-LABEL: {{^}}one_vgpr_used: -; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { - store i32 %x, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/reorder-stores.ll b/llvm/test/CodeGen/R600/reorder-stores.ll deleted file mode 100644 index 187650ff9a5..00000000000 --- a/llvm/test/CodeGen/R600/reorder-stores.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 - store <2 x double> %tmp1, <2 x 
double> addrspace(1)* %y, align 16 - ret void -} - -; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 - ret void -} - -; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 - %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 - store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 - store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 - ret void -} - -; SI-LABEL: {{^}}no_reorder_extload_64: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI-NOT: ds_read -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 - %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 - %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> - %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> - %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1> - %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1> - %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> - %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> - store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 - store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/rotl.i64.ll b/llvm/test/CodeGen/R600/rotl.i64.ll deleted file mode 100644 index 3f4ceb7e031..00000000000 --- a/llvm/test/CodeGen/R600/rotl.i64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotl_i64: -; BOTH-DAG: s_lshl_b64 -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH: s_or_b64 -; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %0 = shl i64 %x, %y - %1 = sub i64 64, %y - %2 = lshr i64 %x, %1 - %3 = or i64 %0, %2 - store i64 %3, i64 addrspace(1)*
%in - ret void -} - -; BOTH-LABEL: {{^}}v_rotl_i64: -; SI-DAG: v_lshl_b64 -; VI-DAG: v_lshlrev_b64 -; BOTH-DAG: v_sub_i32 -; SI: v_lshr_b64 -; VI: v_lshrrev_b64 -; BOTH: v_or_b32 -; BOTH: v_or_b32 -; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = shl i64 %x, %y - %tmp1 = sub i64 64, %y - %tmp2 = lshr i64 %x, %tmp1 - %tmp3 = or i64 %tmp0, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/rotl.ll b/llvm/test/CodeGen/R600/rotl.ll deleted file mode 100644 index 6c144cd56ea..00000000000 --- a/llvm/test/CodeGen/R600/rotl.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotl_i32: -; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x -; R600-NEXT: 32 -; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} - -; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} -; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] -; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %0 = shl i32 %x, %y - %1 = sub i32 32, %y - %2 = lshr i32 %x, %1 - %3 = or i32 %0, %2 - store i32 %3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v2i32: -; SI-DAG: s_sub_i32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %0 = shl <2 x i32> %x, %y - %1 = sub <2 x i32> <i32 32, i32 32>, %y - %2 = lshr <2 x i32> %x, %1 - %3 = or <2 x i32> %0, %2 - store <2 x i32> %3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v4i32: -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %0 = shl <4 x i32> %x, %y - %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y - %2 = lshr <4 x i32> %x, %1 - %3 = or <4 x i32> %0, %2 - store <4 x i32> %3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/llvm/test/CodeGen/R600/rotr.i64.ll b/llvm/test/CodeGen/R600/rotr.i64.ll deleted file mode 100644 index 586de44a566..00000000000 --- a/llvm/test/CodeGen/R600/rotr.i64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotr_i64: -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH-DAG: s_lshl_b64 -; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_i64: -; BOTH-DAG: v_sub_i32 -; SI-DAG: v_lshr_b64 -; SI-DAG: v_lshl_b64 -; VI-DAG: v_lshrrev_b64 -; VI-DAG: v_lshlrev_b64 -;
BOTH: v_or_b32 -; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { -entry: - %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { -entry: - %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 - %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 - %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} diff --git a/llvm/test/CodeGen/R600/rotr.ll b/llvm/test/CodeGen/R600/rotr.ll deleted file mode 100644 index 044f9ffe6d6..00000000000 --- a/llvm/test/CodeGen/R600/rotr.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotr_i32: -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %tmp0 = sub i32 32, %y - %tmp1 = shl i32 %x, %tmp0 - %tmp2 = lshr i32 %x, %y - %tmp3 = or i32 %tmp1, %tmp2 - store i32 %tmp3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v2i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y - %tmp1 = shl <2 x i32> %x, %tmp0 - %tmp2 = lshr <2 x i32> %x, %y - %tmp3 = or <2 x i32> %tmp1, %tmp2 - store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v4i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y - %tmp1 = shl <4 x i32> %x, %tmp0 - %tmp2 = lshr <4 x i32> %x, %y - %tmp3 = or <4 x i32> %tmp1, %tmp2 - store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/llvm/test/CodeGen/R600/rsq.ll b/llvm/test/CodeGen/R600/rsq.ll deleted file mode 100644 index b67b800c737..00000000000 --- a/llvm/test/CodeGen/R600/rsq.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare
float @llvm.sqrt.f32(float) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; SI-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 -; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64_e32 -; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { - %val = load double, double addrspace(1)* %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone - %div = fdiv double 1.0, %sqrt - store double %div, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f32_sgpr: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; Recognize that this is rsqrt(a) * rcp(b) * c, -; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. - -; SI-LABEL: @rsqrt_fmul -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 - -; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] -; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] -; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI-UNSAFE: buffer_store_dword [[RESULT]] - -; SI-SAFE-NOT: v_rsq_f32 - -; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %x = call float @llvm.sqrt.f32(float %a) - %y = fmul float %x, %b - %z = fdiv float %c, %y - store float %z, float addrspace(1)* %out.gep - ret void -} diff --git a/llvm/test/CodeGen/R600/rv7x0_count3.ll b/llvm/test/CodeGen/R600/rv7x0_count3.ll deleted file mode 100644 index c3fd923e459..00000000000 --- a/llvm/test/CodeGen/R600/rv7x0_count3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s - -; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] - -define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %1 = extractelement <4 x float> %reg1, i32 0 - %2 = extractelement <4 x float> %reg1, i32 1 - %3 = extractelement <4 x float> %reg1, i32 2 - %4 = extractelement <4 x float> %reg1, i32 3 - %5 = insertelement <4 x float> undef, float %1, i32 0 - %6 = insertelement <4 x float> %5, float %2, i32 1 - %7 = insertelement <4 x float> %6, float %3, i32 2 - %8 = 
insertelement <4 x float> %7, float %4, i32 3 - %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) - %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) - %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) - %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) - %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) - %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) - %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) - %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) - %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) - %19 = fadd <4 x float> %9, %10 - %20 = fadd <4 x float> %19, %11 - %21 = fadd <4 x float> %20, %12 - %22 = fadd <4 x float> %21, %13 - %23 = fadd <4 x float> %22, %14 - %24 = fadd <4 x float> %23, %15 - %25 = fadd <4 x float> %24, %16 - %26 = fadd <4 x float> %25, %17 - %27 = fadd <4 x float> %26, %18 - call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/s_movk_i32.ll b/llvm/test/CodeGen/R600/s_movk_i32.ll deleted file mode 100644 index 6b1a36c979c..00000000000 --- a/llvm/test/CodeGen/R600/s_movk_i32.ll +++ /dev/null @@ -1,185 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_movk_i32_k0: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k1: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k2: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - 
%loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k3: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k4: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k5: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k6: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k7: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - - -; SI-LABEL: {{^}}s_movk_i32_k8: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 
v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k9: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k10: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k11: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k12: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 - store i64 %or, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/saddo.ll b/llvm/test/CodeGen/R600/saddo.ll deleted file mode 100644 index f8ced7942a6..00000000000 --- a/llvm/test/CodeGen/R600/saddo.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare 
{ i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/llvm/test/CodeGen/R600/salu-to-valu.ll b/llvm/test/CodeGen/R600/salu-to-valu.ll deleted file mode 100644 index 0b964957654..00000000000 --- a/llvm/test/CodeGen/R600/salu-to-valu.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; In this test both the pointer and the offset operands to the -; BUFFER_LOAD instructions end up being stored in vgprs. This -; requires us to add the pointer and offset together, store the -; result in the offset operand (vaddr), and then store 0 in an -; sgpr register pair and use that for the pointer operand -; (low 64-bits of srsrc). 
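-
-; As a rough sketch (editor's illustration, not lines from the original
-; test), the addressing described above ends up looking like:
-;   v[0:1] = ptr + offset       ; 64-bit add done in VGPRs, used as vaddr
-;   s[4:5] = 0                  ; low 64 bits of srsrc
-;   buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64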
- -; CHECK-LABEL: {{^}}mubuf: - -; Make sure we aren't using VGPRs for the source operand of s_mov_b64 -; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v - -; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_* -; instructions -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #1 - %1 = call i32 @llvm.r600.read.tidig.y() #1 - %2 = sext i32 %0 to i64 - %3 = sext i32 %1 to i64 - br label %loop - -loop: - %4 = phi i64 [0, %entry], [%5, %loop] - %5 = add i64 %2, %4 - %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 - %7 = load i8, i8 addrspace(1)* %6, align 1 - %8 = or i64 %5, 1 - %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 - %10 = load i8, i8 addrspace(1)* %9, align 1 - %11 = add i8 %7, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - %13 = icmp slt i64 %5, 10 - br i1 %13, label %loop, label %done - -done: - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -declare i32 @llvm.r600.read.tidig.y() #1 - -attributes #1 = { nounwind readnone } - -; Test moving an SMRD instruction to the VALU - -; CHECK-LABEL: {{^}}smrd_valu: -; CHECK: buffer_load_dword [[OUT:v[0-9]+]] -; CHECK: buffer_store_dword [[OUT]] - -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { -entry: - %0 = icmp ne i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - br label %endif - -else: - %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 - br label %endif - -endif: - %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] - %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 - %6 = load i32, i32 addrspace(2)* %5 - store i32 %6, i32 addrspace(1)* %out - ret void -} - -; Test moving an SMRD with an immediate offset to the VALU - -; CHECK-LABEL: {{^}}smrd_valu2: -; CHECK: buffer_load_dword -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %1 = add i32 %0, 4 - %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 - %3 = load i32, i32 addrspace(2)* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v8i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* - %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 - store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v16i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* - %tmp3 =
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 - store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 - ret void -} diff --git a/llvm/test/CodeGen/R600/scalar_to_vector.ll b/llvm/test/CodeGen/R600/scalar_to_vector.ll deleted file mode 100644 index 0970e5d3063..00000000000 --- a/llvm/test/CodeGen/R600/scalar_to_vector.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tmp1 = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tmp1 = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed -; to produce one, but for some reason never made it to selection.
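-
-; A minimal sketch (assumed example, not part of the original file) of the
-; IR shape that normally becomes ISD::SCALAR_TO_VECTOR: a scalar inserted
-; into element 0 of an undef vector:
-;   %v = insertelement <4 x i32> undef, i32 %x, i32 0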
- - -; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { -; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 -; %bc = bitcast i32 %tmp1 to <4 x i8> - -; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> -; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 -; ret void -; } - -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 -; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 -; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> -; %add = add <4 x i32> %bc, -; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> -; %add = add <8 x i16> %bc, -; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } diff --git a/llvm/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/llvm/test/CodeGen/R600/schedule-fs-loop-nested-if.ll deleted file mode 100644 index 11e8f5176f4..00000000000 --- a/llvm/test/CodeGen/R600/schedule-fs-loop-nested-if.ll +++ /dev/null @@ -1,82 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %1, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = fcmp ult float %0, 5.700000e+01 - %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 - %11 = fsub float -0.000000e+00, %10 - %12 = fptosi float %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %8 to i32 - %15 = bitcast float %13 to i32 - %16 = and i32 %14, %15 - %17 = bitcast i32 %16 to float - %18 = bitcast float %17 to i32 - %19 = icmp ne i32 %18, 0 - %20 = fcmp ult float %0, 0.000000e+00 - %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 - %22 = fsub float -0.000000e+00, %21 - %23 = fptosi float %22 to i32 - %24 = bitcast i32 %23 to float - %25 = bitcast float %24 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %19, label %IF, label %ELSE - -IF: ; preds = %main_body - %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 - %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 - br label %ENDIF - -ELSE: ; preds = %main_body - br i1 %26, label %ENDIF, label %ELSE17 - -ENDIF: ; preds = %ELSE17, %ELSE, %IF - %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] - %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) - %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) - %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) - %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %31 = insertelement <4 x float> undef, float %27, i32 0 - %32 = insertelement <4 x float> %31, float %28, i32 1 - %33 = insertelement <4 x float> %32, float %29, i32 2 - %34 = insertelement <4 x float> %33, float %30, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) - ret void - -ELSE17: ; preds = %ELSE - %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %38 = fadd float %35, 0x3FC99999A0000000 - %39 = fadd float %36, 0x3FC99999A0000000 - %40 = fadd float %37, 0x3FC99999A0000000 - %41 = fadd float %38, 0x3FC99999A0000000 - %42 = fadd float %39, 0x3FC99999A0000000 - %43 = fadd float %40, 0x3FC99999A0000000 - %44 = fadd float %41, 0x3FC99999A0000000 - %45 = fadd float %42, 0x3FC99999A0000000 - %46 = fadd float %43, 0x3FC99999A0000000 - %47 = fadd float %44, 0x3FC99999A0000000 - %48 = fadd float %45, 0x3FC99999A0000000 - %49 = fadd float %46, 0x3FC99999A0000000 - br label %ENDIF -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } -attributes #1 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/R600/schedule-fs-loop-nested.ll deleted file mode 100644 index 759197ca61f..00000000000 --- a/llvm/test/CodeGen/R600/schedule-fs-loop-nested.ll +++ /dev/null @@ -1,88 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = bitcast float %3 to i32 - %5 = sdiv i32 %4, 4 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = mul i32 %7, 4 - %9 = bitcast i32 %8 to float - %10 = bitcast float %9 to i32 - %11 = sub i32 0, %10 - %12 = bitcast i32 %11 to float - %13 = bitcast float %3 to i32 - %14 = bitcast float %12 to i32 - %15 = add i32 %13, %14 - %16 = bitcast i32 %15 to float - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = load <4 x float>, <4 x float> addrspace(9)* null - %20 = extractelement <4 x float> %19, i32 1 - %21 = load <4 x float>, <4 x float> addrspace(9)* null - %22 = extractelement <4 x float> %21, i32 2 - br label %LOOP - -LOOP: ; preds = %IF31, %main_body - %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] - %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] - %temp5.0 = phi 
float [ %20, %main_body ], [ %temp5.1, %IF31 ] - %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] - %23 = bitcast float %temp12.0 to i32 - %24 = bitcast float %6 to i32 - %25 = icmp sge i32 %23, %24 - %26 = sext i1 %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - %29 = icmp ne i32 %28, 0 - br i1 %29, label %IF, label %LOOP29 - -IF: ; preds = %LOOP - %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %34 = insertelement <4 x float> undef, float %30, i32 0 - %35 = insertelement <4 x float> %34, float %31, i32 1 - %36 = insertelement <4 x float> %35, float %32, i32 2 - %37 = insertelement <4 x float> %36, float %33, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) - ret void - -LOOP29: ; preds = %LOOP, %ENDIF30 - %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] - %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] - %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] - %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] - %38 = bitcast float %temp20.0 to i32 - %39 = bitcast float %16 to i32 - %40 = icmp sge i32 %38, %39 - %41 = sext i1 %40 to i32 - %42 = bitcast i32 %41 to float - %43 = bitcast float %42 to i32 - %44 = icmp ne i32 %43, 0 - br i1 %44, label %IF31, label %ENDIF30 - -IF31: ; preds = %LOOP29 - %45 = bitcast float %temp12.0 to i32 - %46 = add i32 %45, 1 - %47 = bitcast i32 %46 to float - br label %LOOP - -ENDIF30: ; preds = %LOOP29 - %48 = bitcast float %temp20.0 to i32 - %49 = add i32 %48, 1 - %50 = bitcast i32 %49 to float - br label %LOOP29 -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/R600/schedule-fs-loop.ll b/llvm/test/CodeGen/R600/schedule-fs-loop.ll deleted file mode 100644 index 28cc08abc02..00000000000 --- a/llvm/test/CodeGen/R600/schedule-fs-loop.ll +++ /dev/null @@ -1,55 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = load <4 x float>, <4 x float> addrspace(9)* null - %5 = extractelement <4 x float> %4, i32 0 - %6 = load <4 x float>, <4 x float> addrspace(9)* null - %7 = extractelement <4 x float> %6, i32 1 - %8 = load <4 x float>, <4 x float> addrspace(9)* null - %9 = extractelement <4 x float> %8, i32 2 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] - %10 = bitcast float %temp8.0 to i32 - %11 = bitcast float %3 to i32 - %12 = icmp sge i32 %10, %11 - %13 = sext i1 %12 to i32 - %14 = bitcast i32 %13 to float - %15 = bitcast float %14 to i32 - %16 = icmp ne i32 %15, 0 - br i1 %16, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %17 = call float 
@llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %21 = insertelement <4 x float> undef, float %17, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %19, i32 2 - %24 = insertelement <4 x float> %23, float %20, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0) - ret void - -ENDIF: ; preds = %LOOP - %25 = bitcast float %temp8.0 to i32 - %26 = add i32 %25, 1 - %27 = bitcast i32 %26 to float - br label %LOOP -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/R600/schedule-global-loads.ll b/llvm/test/CodeGen/R600/schedule-global-loads.ll deleted file mode 100644 index 3f728fd873b..00000000000 --- a/llvm/test/CodeGen/R600/schedule-global-loads.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FIXME: This currently doesn't do a great job of clustering the -; loads, which end up with extra moves between them. Right now, it -; seems the only thing areLoadsFromSameBasePtr is accomplishing is -; ordering the loads so that the lower address loads come first. - -; FUNC-LABEL: {{^}}cluster_global_arg_loads: -; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 -; SI: buffer_store_dword [[REG0]] -; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { - %load0 = load i32, i32 addrspace(1)* %ptr, align 4 - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1 - %load1 = load i32, i32 addrspace(1)* %gep, align 4 - store i32 %load0, i32 addrspace(1)* %out0, align 4 - store i32 %load1, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking -; an MUBUF load which does not have a vaddr operand.
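-
-; For context, a sketch (not from the original file) of the two MUBUF
-; addressing forms involved; the first carries a vaddr register pair,
-; the second is the offset-only form with no vaddr operand:
-;   buffer_load_dword v0, v[2:3], s[4:7], 0 addr64
-;   buffer_load_dword v0, s[4:7], 0 offset:4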
-; FUNC-LABEL: {{^}}same_base_ptr_crash: -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { -entry: - %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - %tmp0 = load i32, i32 addrspace(1)* %out - %tmp1 = load i32, i32 addrspace(1)* %out1 - %tmp2 = add i32 %tmp0, %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/schedule-if-2.ll b/llvm/test/CodeGen/R600/schedule-if-2.ll deleted file mode 100644 index 54946509683..00000000000 --- a/llvm/test/CodeGen/R600/schedule-if-2.ll +++ /dev/null @@ -1,94 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %1 = extractelement <4 x float> %0, i32 0 - %2 = fadd float 1.000000e+03, %1 - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %4 = extractelement <4 x float> %3, i32 0 - %5 = bitcast float %4 to i32 - %6 = icmp eq i32 %5, 0 - %7 = sext i1 %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %IF, label %ELSE - -IF: ; preds = %main_body - %11 = call float @fabs(float %2) - %12 = fcmp ueq float %11, 0x7FF0000000000000 - %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 - %14 = fsub float -0.000000e+00, %13 - %15 = fptosi float %14 to i32 - %16 = bitcast i32 %15 to float - %17 = bitcast float %16 to i32 - %18 = icmp ne i32 %17, 0 - %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 - %19 = fcmp une float %2, %2 - %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 - %21 = fsub float -0.000000e+00, %20 - %22 = fptosi float %21 to i32 - %23 = bitcast i32 %22 to float - %24 = bitcast float %23 to i32 - %25 = icmp ne i32 %24, 0 - %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 - %26 = bitcast float %. 
to i32 - %27 = sitofp i32 %26 to float - %28 = bitcast float %temp8.0 to i32 - %29 = sitofp i32 %28 to float - %30 = fcmp ugt float %2, 0.000000e+00 - %31 = select i1 %30, float 1.000000e+00, float %2 - %32 = fcmp uge float %31, 0.000000e+00 - %33 = select i1 %32, float %31, float -1.000000e+00 - %34 = fadd float %33, 1.000000e+00 - %35 = fmul float %34, 5.000000e-01 - br label %ENDIF - -ELSE: ; preds = %main_body - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 0 - %38 = bitcast float %37 to i32 - %39 = icmp eq i32 %38, 1 - %40 = sext i1 %39 to i32 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp ne i32 %42, 0 - br i1 %43, label %IF23, label %ENDIF - -ENDIF: ; preds = %IF23, %ELSE, %IF - %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 - %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 - %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 - %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) - ret void - -IF23: ; preds = %ELSE - %48 = fcmp ult float 0.000000e+00, %2 - %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 - %50 = fsub float -0.000000e+00, %49 - %51 = fptosi float %50 to i32 - %52 = bitcast i32 %51 to float - %53 = bitcast float %52 to i32 - %54 = icmp ne i32 %53, 0 - %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 - %55 = bitcast float %.28 to i32 - %56 = sitofp i32 %55 to float - %57 = load <4 x float>, <4 x float> addrspace(8)* null - %58 = extractelement <4 x float> %57, i32 0 - %59 = fsub float -0.000000e+00, %58 - %60 = fadd float %2, %59 - br label %ENDIF -} - -declare float @fabs(float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readonly } diff --git a/llvm/test/CodeGen/R600/schedule-if.ll b/llvm/test/CodeGen/R600/schedule-if.ll deleted file mode 100644 index 94c653c8f25..00000000000 --- a/llvm/test/CodeGen/R600/schedule-if.ll +++ /dev/null @@ -1,46 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 1.000000e+03, %main_body ], 
[ 1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 1.000000e+03, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/llvm/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/R600/schedule-kernel-arg-loads.ll deleted file mode 100644 index 6b3e0814c38..00000000000 --- a/llvm/test/CodeGen/R600/schedule-kernel-arg-loads.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s - -; FUNC-LABEL: {{^}}cluster_arg_loads: -; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe -; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { - store i32 %x, i32 addrspace(1)* %out0, align 4 - store i32 %y, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when -; s_load_dwordx2 has a register offset - -; FUNC-LABEL: @same_base_ptr_crash -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, - i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, - i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, - i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, - i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, - i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, - i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, - i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, - i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, - i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, - i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, - i64 
%arg80, i64 %arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, - i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, - i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, - i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, - i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, - i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { -entry: - %value = add i64 %arg125, %arg126 - store i64 %value, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll deleted file mode 100644 index 3863afda5dd..00000000000 --- a/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll +++ /dev/null @@ -1,163 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI - -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate - - -; SI-LABEL: {{^}}main( -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 2 - %2 = fcmp ult float %0, 0.000000e+00 - %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 - %4 = fsub float -0.000000e+00, %3 - %5 = fptosi float %4 to i32 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = icmp ne i32 %7, 0 - br i1 %8, label %LOOP, label %ENDIF - -Flow1: ; preds = %ENDIF19, %ENDIF16 - %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] - %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] - %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] - %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] - %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] - %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] - br label %Flow - -Flow2: ; preds = %Flow - br label %ENDIF - -ENDIF: ; preds = %main_body, %Flow2 - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] - %15 = extractelement <4 x float> %reg1, i32 1 - %16 = extractelement <4 x float> %reg1, i32 3 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* null - %24 = extractelement <4 x float> %23, i32 2 - %25 = fmul float %24, %0 - %26 = load <4 x float>, <4 x float> addrspace(9)* null - %27 = extractelement <4 x float> %26, i32 3 - %28 = fmul float %27, %0 - %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float %30, %15 - %32 = fadd float %31, %19 - %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 
0, i32 1) - %34 = extractelement <4 x float> %33, i32 1 - %35 = fmul float %34, %15 - %36 = fadd float %35, %22 - %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %38 = extractelement <4 x float> %37, i32 2 - %39 = fmul float %38, %15 - %40 = fadd float %39, %25 - %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %42 = extractelement <4 x float> %41, i32 3 - %43 = fmul float %42, %15 - %44 = fadd float %43, %28 - %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %46 = extractelement <4 x float> %45, i32 0 - %47 = fmul float %46, %1 - %48 = fadd float %47, %32 - %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %50 = extractelement <4 x float> %49, i32 1 - %51 = fmul float %50, %1 - %52 = fadd float %51, %36 - %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %54 = extractelement <4 x float> %53, i32 2 - %55 = fmul float %54, %1 - %56 = fadd float %55, %40 - %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %58 = extractelement <4 x float> %57, i32 3 - %59 = fmul float %58, %1 - %60 = fadd float %59, %44 - %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %62 = extractelement <4 x float> %61, i32 0 - %63 = fmul float %62, %16 - %64 = fadd float %63, %48 - %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %66 = extractelement <4 x float> %65, i32 1 - %67 = fmul float %66, %16 - %68 = fadd float %67, %52 - %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %70 = extractelement <4 x float> %69, i32 2 - %71 = fmul float %70, %16 - %72 = fadd float %71, %56 - %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %74 = extractelement <4 x float> %73, i32 3 - %75 = fmul float %74, %16 - %76 = fadd float %75, %60 - %77 = insertelement <4 x float> undef, float %64, i32 0 - %78 = insertelement <4 x float> %77, float %68, i32 1 - %79 = insertelement <4 x float> %78, float %72, i32 2 - %80 = insertelement <4 x float> %79, float %76, i32 3 - call void @llvm.AMDGPU.barrier.local() - %81 = insertelement <4 x float> undef, float %temp.0, i32 0 - %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 - %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 - %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 - call void @llvm.AMDGPU.barrier.local() - ret void - -LOOP: ; preds = %main_body, %Flow - %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] - %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] - %85 = fcmp uge float %temp4.0, %0 - 
%86 = select i1 %85, float 1.000000e+00, float 0.000000e+00 - %87 = fsub float -0.000000e+00, %86 - %88 = fptosi float %87 to i32 - %89 = bitcast i32 %88 to float - %90 = bitcast float %89 to i32 - %91 = icmp ne i32 %90, 0 - %92 = xor i1 %91, true - br i1 %92, label %ENDIF16, label %Flow - -ENDIF16: ; preds = %LOOP - %93 = fcmp une float %1, %temp4.0 - %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 - %95 = fsub float -0.000000e+00, %94 - %96 = fptosi float %95 to i32 - %97 = bitcast i32 %96 to float - %98 = bitcast float %97 to i32 - %99 = icmp ne i32 %98, 0 - %100 = xor i1 %99, true - br i1 %100, label %ENDIF19, label %Flow1 - -Flow: ; preds = %Flow1, %LOOP - %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] - %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] - %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] - %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] - %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] - %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] - %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] - %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] - %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] - %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] - br i1 %110, label %Flow2, label %LOOP - -ENDIF19: ; preds = %ENDIF16 - %111 = fadd float %temp.1, 1.000000e+00 - %112 = fadd float %temp1.1, 0.000000e+00 - %113 = fadd float %temp2.1, 0.000000e+00 - %114 = fadd float %temp3.1, 0.000000e+00 - %115 = fadd float %temp4.0, 1.000000e+00 - br label %Flow1 -} - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop.ll deleted file mode 100644 index 8d980dbf899..00000000000 --- a/llvm/test/CodeGen/R600/schedule-vs-if-nested-loop.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %0, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %LOOP, label %ENDIF - -ENDIF: ; preds = %ENDIF16, %LOOP, %main_body - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] - %11 = load <4 x float>, <4 x float> addrspace(9)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %12, %0 - %14 = load <4 x float>, <4 x float> addrspace(9)* null - %15 = extractelement <4 x float> %14, i32 1 - %16 = fmul float %15, %0 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 2 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 3 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], 
[1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float %24, %1 - %26 = fadd float %25, %13 - %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %28 = extractelement <4 x float> %27, i32 1 - %29 = fmul float %28, %1 - %30 = fadd float %29, %16 - %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %32 = extractelement <4 x float> %31, i32 2 - %33 = fmul float %32, %1 - %34 = fadd float %33, %19 - %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %36 = extractelement <4 x float> %35, i32 3 - %37 = fmul float %36, %1 - %38 = fadd float %37, %22 - %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %40 = extractelement <4 x float> %39, i32 0 - %41 = fmul float %40, %2 - %42 = fadd float %41, %26 - %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %44 = extractelement <4 x float> %43, i32 1 - %45 = fmul float %44, %2 - %46 = fadd float %45, %30 - %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %48 = extractelement <4 x float> %47, i32 2 - %49 = fmul float %48, %2 - %50 = fadd float %49, %34 - %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %52 = extractelement <4 x float> %51, i32 3 - %53 = fmul float %52, %2 - %54 = fadd float %53, %38 - %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %56 = extractelement <4 x float> %55, i32 0 - %57 = fmul float %56, %3 - %58 = fadd float %57, %42 - %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %60 = extractelement <4 x float> %59, i32 1 - %61 = fmul float %60, %3 - %62 = fadd float %61, %46 - %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %64 = extractelement <4 x float> %63, i32 2 - %65 = fmul float %64, %3 - %66 = fadd float %65, %50 - %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %68 = extractelement <4 x float> %67, i32 3 - %69 = fmul float %68, %3 - %70 = fadd float %69, %54 - %71 = insertelement <4 x float> undef, float %58, i32 0 - %72 = insertelement <4 x float> %71, float %62, i32 1 - %73 = insertelement <4 x float> %72, float %66, i32 2 - %74 = insertelement <4 x float> %73, float %70, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) - %75 = insertelement <4 x float> undef, float %temp.0, i32 0 - %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 - %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 - %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) - ret void - -LOOP: ; preds = %main_body, %ENDIF19 - %temp.1 = phi float [ %93, %ENDIF19 ], [ 
0.000000e+00, %main_body ] - %temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ] - %79 = fcmp uge float %temp4.0, %0 - %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00 - %81 = fsub float -0.000000e+00, %80 - %82 = fptosi float %81 to i32 - %83 = bitcast i32 %82 to float - %84 = bitcast float %83 to i32 - %85 = icmp ne i32 %84, 0 - br i1 %85, label %ENDIF, label %ENDIF16 - -ENDIF16: ; preds = %LOOP - %86 = fcmp une float %2, %temp4.0 - %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00 - %88 = fsub float -0.000000e+00, %87 - %89 = fptosi float %88 to i32 - %90 = bitcast i32 %89 to float - %91 = bitcast float %90 to i32 - %92 = icmp ne i32 %91, 0 - br i1 %92, label %ENDIF, label %ENDIF19 - -ENDIF19: ; preds = %ENDIF16 - %93 = fadd float %temp.1, 1.000000e+00 - %94 = fadd float %temp1.1, 0.000000e+00 - %95 = fadd float %temp2.1, 0.000000e+00 - %96 = fadd float %temp3.1, 0.000000e+00 - %97 = fadd float %temp4.0, 1.000000e+00 - br label %LOOP -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/llvm/test/CodeGen/R600/scratch-buffer.ll b/llvm/test/CodeGen/R600/scratch-buffer.ll deleted file mode 100644 index 56088718ada..00000000000 --- a/llvm/test/CodeGen/R600/scratch-buffer.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; When a frame index offset is more than 12 bits, make sure we don't store -; it in mubuf's offset field. - -; Also, make sure we use the same register for storing the scratch buffer address -; for both stores. This register is allocated by the register scavenger, so we -; should be able to reuse the same register for each scratch buffer access.
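-
-; Worked numbers (editor's sketch, derived from the allocas below): each
-; scratch array is [8192 x i32] = 8192 * 4 = 32768 = 0x8000 bytes, so the
-; second buffer starts at offset 0x8000. A 12-bit mubuf offset field tops
-; out at 4095, so 0x8000 cannot be encoded there and must be materialized
-; in a VGPR and applied via offen, which is what the checks below expect.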
- -; CHECK-LABEL: {{^}}legal_offset_fi: -; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 - store i32 1, i32* %scratchptr0 - - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 - store i32 2, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}legal_offset_fi_offset: -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %offset0 = load i32, i32 addrspace(1)* %offsets - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 - store i32 %offset0, i32* %scratchptr0 - - %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 - %offset1 = load i32, i32 addrspace(1)* %offsetptr1 - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 - store i32 %offset1, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void -} - diff --git a/llvm/test/CodeGen/R600/sdiv.ll b/llvm/test/CodeGen/R600/sdiv.ll deleted file mode 100644 index de645353a40..00000000000 --- a/llvm/test/CodeGen/R600/sdiv.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by sdiv is long and complex and may frequently change. -; The goal of this test is to make sure the ISel doesn't fail.
-; -; This program was previously failing to compile when one of the selectcc -; opcodes generated by the sdiv lowering was being legalized and optimized to: -; selectcc Remainder -1, 0, -1, SETGT -; This was fixed by adding an additional pattern in R600Instructions.td to -; match this pattern with a CNDGE_INT. - -; FUNC-LABEL: {{^}}sdiv_i32: -; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = sdiv i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; Multiply by a weird constant to make sure setIntDivIsCheap is -; working. - -; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b -; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] -; SI: v_add_i32 -; SI: v_lshrrev_b32 -; SI: v_ashrrev_i32 -; SI: v_add_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 3435 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = sdiv <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = sdiv <2 x i32> %num, <i32 4, i32 4> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = sdiv <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = sdiv i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = srem i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %resultdiv = sdiv i64 %a, %b -; %resultrem = srem i64 %a, %b -; %result = add i64 %resultdiv, %resultrem -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } diff --git a/llvm/test/CodeGen/R600/sdivrem24.ll b/llvm/test/CodeGen/R600/sdivrem24.ll deleted file mode 100644 index ad5df39f550..00000000000 --- a/llvm/test/CodeGen/R600/sdivrem24.ll +++ /dev/null @@ -1,239 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sdiv24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = sdiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = sdiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sdiv25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: 
v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = srem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = srem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_1: -; SI-NOT: 
v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/sdivrem64.ll b/llvm/test/CodeGen/R600/sdivrem64.ll deleted file mode 100644 index a9b2b7f9df5..00000000000 --- a/llvm/test/CodeGen/R600/sdivrem64.ll +++ /dev/null @@ -1,225 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_sdiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = sdiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT 
-;EG: AND_INT {{.*}}, 1, - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = srem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/select-i1.ll b/llvm/test/CodeGen/R600/select-i1.ll deleted file mode 100644 index 6735394e93a..00000000000 --- a/llvm/test/CodeGen/R600/select-i1.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI - -; FUNC-LABEL: {{^}}select_i1: -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i1 %a, i1 %b - store i1 %sel, i1 addrspace(1)* %out, align 4 - ret void -} - diff --git a/llvm/test/CodeGen/R600/select-vectors.ll b/llvm/test/CodeGen/R600/select-vectors.ll deleted file mode 100644 index 59082c65cc8..00000000000 --- a/llvm/test/CodeGen/R600/select-vectors.ll +++ /dev/null @@ -1,156 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck
-check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test expansion of scalar selects on vectors. -; Evergreen not enabled since it seems to be having problems with doubles. - - -; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { - %cmp = icmp eq i8 %c, 0 - %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b - store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b - store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v2i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx2 -define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b - store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx4 -define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b - store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b - store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f32: -; SI: buffer_store_dwordx2 -define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x float> %a, <2 x float> %b - store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f32: -; SI: buffer_store_dwordx4 -define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x float> %a, <4 x float> %b - store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x float> %a, <8 x float> %b - store <8 x float> %select, <8 x float> 
addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x double> %a, <2 x double> %b - store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x double> %a, <4 x double> %b - store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x double> %a, <8 x double> %b - store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/select.ll b/llvm/test/CodeGen/R600/select.ll deleted file mode 100644 index 45f3cd5a7ac..00000000000 --- a/llvm/test/CodeGen/R600/select.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; Normally icmp + select is optimized to select_cc; when this happens, the -; DAGLegalizer never sees the select and doesn't have a chance to legalize it. -; -; In order to avoid the select_cc optimization, this test case calculates the -; condition for the select in a separate basic block.
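For contrast, here is a minimal same-block sketch (hand-written for illustration only; the function name and constants are invented, and this module is not part of the deleted test). With the icmp and the select in one basic block, lowering folds the pair into a single select_cc node, which is exactly what the test below is structured to avoid:

define i32 @select_cc_folds(i32 %cond, i32 %i) {
entry:
  ; The icmp feeds the select directly in the same block, so the DAG
  ; combines the pair into one select_cc and the plain select node
  ; never reaches the legalizer.
  %cmp = icmp eq i32 %cond, %i
  %sel = select i1 %cmp, i32 2, i32 3
  ret i32 %sel
}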
- -; FUNC-LABEL: {{^}}select: -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, - <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, - <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, - i32 %cond) { -entry: - br label %for -body: - %inc = add i32 %i, 1 - %br_cmp.i = icmp eq i1 %br_cmp, 0 - br label %for -for: - %i = phi i32 [ %inc, %body], [ 0, %entry ] - %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] - %0 = icmp eq i32 %cond, %i - %1 = select i1 %br_cmp, i32 2, i32 3 - %2 = select i1 %br_cmp, float 2.0 , float 5.0 - %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 2>, <2 x i32> <i32 3, i32 3> - %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 2.0>, <2 x float> <float 5.0, float 5.0> - %5 = select i1 %br_cmp, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, <4 x i32> <i32 3, i32 3, i32 3, i32 3> - %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, <4 x float> <float 5.0, float 5.0, float 5.0, float 5.0> - br i1 %0, label %body, label %done - -done: - store i32 %1, i32 addrspace(1)* %i32out - store float %2, float addrspace(1)* %f32out - store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out - store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out - store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out - store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out - ret void -} diff --git a/llvm/test/CodeGen/R600/select64.ll b/llvm/test/CodeGen/R600/select64.ll deleted file mode 100644 index 5cebb30dc72..00000000000 --- a/llvm/test/CodeGen/R600/select64.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}select0: -; i64 select should be split into two i32 selects, and we shouldn't need -; to use a shift to extract the hi dword of the input.
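As a rough sketch of the split described above (hand-written IR for illustration only; the legalizer performs this on SelectionDAG nodes rather than on IR, and the function name is invented):

define i64 @split_i64_select(i1 %cmp, i64 %a, i64 %b) {
  ; Name the lo and hi dwords of each input. The lshr is only an
  ; IR-level way to refer to the hi half; after legalization both
  ; halves already live in separate 32-bit registers, which is why
  ; the test can forbid s_lshr_b64 in the final code.
  %a.lo = trunc i64 %a to i32
  %b.lo = trunc i64 %b to i32
  %a.hi.64 = lshr i64 %a, 32
  %b.hi.64 = lshr i64 %b, 32
  %a.hi = trunc i64 %a.hi.64 to i32
  %b.hi = trunc i64 %b.hi.64 to i32
  ; One i32 select per half; each lowers to a v_cndmask_b32.
  %lo = select i1 %cmp, i32 %a.lo, i32 %b.lo
  %hi = select i1 %cmp, i32 %a.hi, i32 %b.hi
  ; Reassemble the i64 result from the selected halves.
  %lo.64 = zext i32 %lo to i64
  %hi.64 = zext i32 %hi to i64
  %hi.shl = shl i64 %hi.64, 32
  %res = or i64 %hi.shl, %lo.64
  ret i64 %res
}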
-; CHECK-NOT: s_lshr_b64 -; CHECK: v_cndmask -; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { -entry: - %0 = icmp ugt i32 %cond, 5 - %1 = select i1 %0, i64 0, i64 %in - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 0, i64 %in - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_i64_split_imm: -; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 -; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 -; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] -; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} -; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 - store i64 %sel, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/selectcc-cnd.ll b/llvm/test/CodeGen/R600/selectcc-cnd.ll deleted file mode 100644 index 94d0ace7569..00000000000 --- a/llvm/test/CodeGen/R600/selectcc-cnd.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE -;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, -;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { - %1 = load float, float addrspace(1)* %in - %2 = fcmp oeq float %1, 0.0 - %3 = select i1 %2, float 1.0, float 2.0 - store float %3, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/selectcc-cnde-int.ll b/llvm/test/CodeGen/R600/selectcc-cnde-int.ll deleted file mode 100644 index 58a4ee7d62b..00000000000 --- a/llvm/test/CodeGen/R600/selectcc-cnde-int.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE_INT -;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, -;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %1 = load i32, i32 addrspace(1)* %in - %2 = icmp eq i32 %1, 0 - %3 = select i1 %2, i32 1, i32 2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git 
a/llvm/test/CodeGen/R600/selectcc-icmp-select-float.ll b/llvm/test/CodeGen/R600/selectcc-icmp-select-float.ll deleted file mode 100644 index e870ee891e6..00000000000 --- a/llvm/test/CodeGen/R600/selectcc-icmp-select-float.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Note additional optimizations may cause this SGT to be replaced with a -; CND* instruction. -; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, -; CHECK-NEXT: -1 -; Test a selectcc with i32 LHS/RHS and float True/False - -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = icmp sge i32 %0, 0 - %2 = select i1 %1, float 1.0, float 0.0 - store float %2, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/selectcc-opt.ll b/llvm/test/CodeGen/R600/selectcc-opt.ll deleted file mode 100644 index 65be4a626a1..00000000000 --- a/llvm/test/CodeGen/R600/selectcc-opt.ll +++ /dev/null @@ -1,80 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}test_a: -; EG-NOT: CND -; EG: SET{{[NEQGTL]+}}_DX10 - -define void @test_a(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.000000e+00 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %IF, label %ENDIF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Same as test_a, but the branch labels are swapped to produce the inverse cc -; for the icmp instruction - -; EG-LABEL: {{^}}test_b: -; EG: SET{{[GTEQN]+}}_DX10 -; EG-NEXT: PRED_ -; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %ENDIF, label %IF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Test a CND*_INT instruction with float true/false values -; EG-LABEL: {{^}}test_c: -; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - %1 = select i1 %0, float 2.0, float 3.0 - store float %1, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}selectcc_bool: -; SI: v_cmp_ne_i32 -; SI-NEXT: v_cndmask_b32_e64 -; SI-NOT: cmp -; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = select i1 %icmp0, i32 -1, i32 0 - store i32 %ext, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/selectcc.ll b/llvm/test/CodeGen/R600/selectcc.ll deleted file mode 100644 index f378e15dd76..00000000000 --- a/llvm/test/CodeGen/R600/selectcc.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=r600 
-mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}selectcc_i64: -; EG: XOR_INT -; EG: XOR_INT -; EG: OR_INT -; EG: CNDE_INT -; EG: CNDE_INT -; SI: v_cmp_eq_i64 -; SI: v_cndmask -; SI: v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { -entry: - %0 = icmp eq i64 %lhs, %rhs - %1 = select i1 %0, i64 %true, i64 %false - store i64 %1, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/set-dx10.ll b/llvm/test/CodeGen/R600/set-dx10.ll deleted file mode 100644 index 53694dcffa6..00000000000 --- a/llvm/test/CodeGen/R600/set-dx10.ll +++ /dev/null @@ -1,161 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests check that floating point comparisons which are used by select -; to store integer true (-1) and false (0) values are lowered to one of the -; SET*DX10 instructions. - -; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float 
%in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/setcc-equivalent.ll b/llvm/test/CodeGen/R600/setcc-equivalent.ll deleted file mode 100644 index 11ea793650c..00000000000 --- a/llvm/test/CodeGen/R600/setcc-equivalent.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; EG-LABEL: {{^}}and_setcc_setcc_i32: -; EG: AND_INT -; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %cmp1 = icmp eq i32 %a, -1 - %cmp2 = icmp eq i32 %b, -1 - %and = and i1 %cmp1, %cmp2 - %ext = sext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; EG-LABEL: {{^}}and_setcc_setcc_v4i32: -; EG: AND_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { - %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> - %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> -
%and = and <4 x i1> %cmp1, %cmp2 - %ext = sext <4 x i1> %and to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/setcc-opt.ll b/llvm/test/CodeGen/R600/setcc-opt.ll deleted file mode 100644 index 4e6a10d6b78..00000000000 --- a/llvm/test/CodeGen/R600/setcc-opt.ll +++ /dev/null @@ -1,236 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT:buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to false -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to true -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: 
v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: -; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, 255 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1: -; GCN: buffer_load_sbyte [[B:v[0-9]+]] -; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { - %b = load i8, i8 addrspace(1)* %b.ptr - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FIXME: This ends up doing a buffer_load_ubyte, and a compare with -; 255.
Seems to be because of ordering problems when not allowing load widths to be reduced. -; Should do a buffer_load_sbyte and compare with -1 - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: -; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_neg1: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/setcc.ll b/llvm/test/CodeGen/R600/setcc.ll deleted file mode 100644 index f33a82df5ff..00000000000 --- a/llvm/test/CodeGen/R600/setcc.ll +++ /dev/null @@ -1,377 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}setcc_v2i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y - -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { - %result = icmp eq <2 x i32> %a, %b - %sext = sext <2 x i1> %result to <2 x i32> - store <2 x i32> %sext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}setcc_v4i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = icmp eq <4 x i32> %a, %b - %sext = sext <4 x i1> %result to <4 x i32> - store <4 x i32> %sext, <4 x i32> addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; Float comparisons 
-;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f32_oeq: -; R600: SETE_DX10 -; SI: v_cmp_eq_f32 -define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oeq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ogt: -; R600: SETGT_DX10 -; SI: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ogt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_oge: -; R600: SETGE_DX10 -; SI: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_olt: -; R600: SETGT_DX10 -; SI: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp olt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ole: -; R600: SETGE_DX10 -; SI: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ole float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_one: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_lg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp one float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ord: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT -; SI: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ord float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ueq: -; R600-DAG: SETNE_DX10 -; R600-DAG: SETNE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_nlg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ueq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ugt: -; R600: SETGE -; R600: SETE_DX10 -; SI: v_cmp_nle_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ugt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uge: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_nlt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ult: -; R600: SETGE -; R600: SETE_DX10 - -; SI: v_cmp_nge_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ult float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}f32_ule: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_ngt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ule float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_une: -; R600: SETNE_DX10 -; SI: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp une float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uno: -; R600: SETNE_DX10 -; R600: SETNE_DX10 -; R600: OR_INT -; R600: SETNE_INT -; SI: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uno float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 32-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i32_eq: -; R600: SETE_INT -; SI: v_cmp_eq_i32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ne: -; R600: SETNE_INT -; SI: v_cmp_ne_i32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ne i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ugt: -; R600: SETGT_UINT -; SI: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ugt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_uge: -; R600: SETGE_UINT -; SI: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp uge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ult: -; R600: SETGT_UINT -; SI: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ult i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ule: -; R600: SETGE_UINT -; SI: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ule i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sgt: -; R600: SETGT_INT -; SI: v_cmp_gt_i32 -define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sgt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sge: -; R600: SETGE_INT -; SI: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_slt: -; R600: SETGT_INT -; SI: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp slt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sle: -; R600: SETGE_INT -; SI: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sle i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FIXME: 
This does 4 compares -; FUNC-LABEL: {{^}}v3i32_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid - %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a - %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i32> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i32> - store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}v3i8_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid - %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a - %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i8> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i8> - store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out - ret void -} diff --git a/llvm/test/CodeGen/R600/setcc64.ll b/llvm/test/CodeGen/R600/setcc64.ll deleted file mode 100644 index 231be7aa3da..00000000000 --- a/llvm/test/CodeGen/R600/setcc64.ll +++ /dev/null @@ -1,259 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; XXX: Merge this into setcc, once R600 supports 64-bit operations - -;;;==========================================================================;;; -;; Double comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f64_oeq: -; SI: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oeq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ogt: -; SI: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ogt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_oge: -; SI: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_olt: -; SI: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp olt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ole: -; SI: 
v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ole double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_one: -; SI: v_cmp_lg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp one double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ord: -; SI: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ord double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ueq: -; SI: v_cmp_nlg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ueq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ugt: - -; SI: v_cmp_nle_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ugt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uge: -; SI: v_cmp_nlt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ult: -; SI: v_cmp_nge_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ult double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ule: -; SI: v_cmp_ngt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ule double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_une: -; SI: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp une double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uno: -; SI: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uno double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 64-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i64_eq: -; SI: v_cmp_eq_i64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ne: -; SI: v_cmp_ne_i64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ne i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ugt: -; SI: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ugt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 
addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_uge: -; SI: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp uge i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ult: -; SI: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ult i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ule: -; SI: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ule i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sgt: -; SI: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sgt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sge: -; SI: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sge i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_slt: -; SI: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp slt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sle: -; SI: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sle i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/seto.ll b/llvm/test/CodeGen/R600/seto.ll deleted file mode 100644 index 9b5d6b5dbd6..00000000000 --- a/llvm/test/CodeGen/R600/seto.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp oeq float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/setuo.ll b/llvm/test/CodeGen/R600/setuo.ll deleted file mode 100644 index 76346c4f624..00000000000 --- a/llvm/test/CodeGen/R600/setuo.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp une float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/sext-eliminate.ll b/llvm/test/CodeGen/R600/sext-eliminate.ll deleted file mode 100644 index 7dc6eb87f6b..00000000000 --- 
a/llvm/test/CodeGen/R600/sext-eliminate.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: SUB_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = add i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = sub i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/sext-in-reg.ll b/llvm/test/CodeGen/R600/sext-in-reg.ll deleted file mode 100644 index 5aedda2ce1a..00000000000 --- a/llvm/test/CodeGen/R600/sext-in-reg.ll +++ /dev/null @@ -1,611 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 -; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] -; SI: buffer_store_dword [[EXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { - %shl = shl i32 %in, 31 - %sext = ashr i32 %shl, 31 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 16 - %ashr = ashr i32 %shl, 16 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], 
[[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %c = add <1 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <1 x i32> %c, - %ashr = ashr <1 x i32> %shl, - store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} 
[[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG-NOT: BFE_INT - -; EG: ASHR [[RES_HI]] - -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. -; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: -; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 -; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 -; XSI: buffer_store_dword -; XEG: BFE_INT -; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { -; %c = add <1 x i64> %a, %b -; %shl = shl <1 x i64> %c, -; %ashr = ashr <1 x i64> %shl, -; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } - -; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, 
i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b - %x = shl i32 %c, 6 - %y = ashr i32 %x, 7 - store i32 %y, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI: s_endpgm - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b - %x = shl <2 x i32> %c, - %y = ashr <2 x i32> %x, - store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2 - ret void -} - - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> 
addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32: -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb 
= load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type: -; SI: buffer_load_sbyte -; SI: v_max_i32 -; SI-NOT: bfe -; SI: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = sext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = sext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_0_width: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_8: -; SI: v_bfe_i32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_16: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; This really should be folded into 1 -; FUNC-LABEL: {{^}}bfe_16_bfe_8: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure there isn't a redundant BFE -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: -; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void 
@sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: -; SI: buffer_load_sbyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI: .text -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: -; SI-NOT: shr -; SI-NOT: shl -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: shl -; SI-NOT: shr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 -; SI: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/sgpr-control-flow.ll b/llvm/test/CodeGen/R600/sgpr-control-flow.ll deleted file mode 100644 index 38289ced632..00000000000 --- a/llvm/test/CodeGen/R600/sgpr-control-flow.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck 
-check-prefix=SI %s -; -; -; Most SALU instructions ignore control flow, so we need to make sure -; they don't overwrite values from other blocks. - -; If the branch decision is made based on a value in an SGPR then all -; threads will execute the same code paths, so we don't need to worry -; about instructions in different blocks overwriting each other. -; SI-LABEL: {{^}}sgpr_if_else_salu_br: -; SI: s_add -; SI: s_add - -define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = add i32 %b, %c - br label %endif - -else: - %2 = add i32 %d, %e - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - %4 = add i32 %3, %a - store i32 %4, i32 addrspace(1)* %out - ret void -} - -; The two S_ADD instructions should write to different registers, since -; different threads will take different control flow paths. - -; SI-LABEL: {{^}}sgpr_if_else_valu_br: -; SI: s_add_i32 [[SGPR:s[0-9]+]] -; SI-NOT: s_add_i32 [[SGPR]] - -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tid_f = uitofp i32 %tid to float - %tmp1 = fcmp ueq float %tid_f, 0.0 - br i1 %tmp1, label %if, label %else - -if: - %tmp2 = add i32 %b, %c - br label %endif - -else: - %tmp3 = add i32 %d, %e - br label %endif - -endif: - %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should write to different SGPR pairs instead of copying to -; VALU for i1 phi. - -; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] - -; SI: BB2_1: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] - -; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] -; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tmp1 = icmp eq i32 %tid, 0 - br i1 %tmp1, label %if, label %else - -if: - %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid - %a.val = load i32, i32 addrspace(1)* %gep.if - %cmp.if = icmp eq i32 %a.val, 0 - br label %endif - -else: - %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid - %b.val = load i32, i32 addrspace(1)* %gep.else - %cmp.else = icmp slt i32 %b.val, 0 - br label %endif - -endif: - %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] - %ext = sext i1 %tmp4 to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { readnone } diff --git a/llvm/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll deleted file mode 100644 index df67fcca22f..00000000000 --- a/llvm/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; Copy VGPR -> SGPR used twice as an instruction operand, which is then -; used in an REG_SEQUENCE that also needs to be 
handled. - -; SI-LABEL: {{^}}test_dup_operands: -; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %lo = extractelement <2 x i32> %a, i32 0 - %hi = extractelement <2 x i32> %a, i32 1 - %add = add i32 %lo, %lo - %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 - %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 - store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - diff --git a/llvm/test/CodeGen/R600/sgpr-copy.ll b/llvm/test/CodeGen/R600/sgpr-copy.ll deleted file mode 100644 index b849c4038bc..00000000000 --- a/llvm/test/CodeGen/R600/sgpr-copy.ll +++ /dev/null @@ -1,379 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This test checks that no VGPR to SGPR copies are created by the register -; allocator. -; CHECK-LABEL: {{^}}phi1: -; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 -; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] - -define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %25 = fptosi float %23 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %26, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %27 = fsub float -0.000000e+00, %22 - br label %ENDIF - -ENDIF: ; preds = %main_body, %ELSE - %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] - %28 = fadd float %temp.0, %24 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) - ret void -} - -; Make sure this program doesn't crash -; CHECK-LABEL: {{^}}phi2: -define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) - %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) - %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) - %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) - %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) - %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) - %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) - %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) - %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) - %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) - %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) - %36 = call 
float @llvm.SI.load.const(<16 x i8> %21, i32 92) - %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 - %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 - %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) - %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) - %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) - %46 = bitcast float %41 to i32 - %47 = bitcast float %42 to i32 - %48 = insertelement <2 x i32> undef, i32 %46, i32 0 - %49 = insertelement <2 x i32> %48, i32 %47, i32 1 - %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) - %51 = extractelement <4 x float> %50, i32 2 - %52 = call float @fabs(float %51) - %53 = fmul float %43, %43 - %54 = fmul float %44, %44 - %55 = fadd float %54, %53 - %56 = fmul float %45, %45 - %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq.f32(float %57) - %59 = fmul float %43, %58 - %60 = fmul float %44, %58 - %61 = fmul float %45, %58 - %62 = fmul float %59, %23 - %63 = fmul float %60, %24 - %64 = fadd float %63, %62 - %65 = fmul float %61, %25 - %66 = fadd float %64, %65 - %67 = fsub float -0.000000e+00, %26 - %68 = fmul float %66, %52 - %69 = fadd float %68, %67 - %70 = fmul float %27, %69 - %71 = fmul float %28, %69 - %72 = call float @fabs(float %70) - %73 = fcmp olt float 0x3EE4F8B580000000, %72 - %74 = sext i1 %73 to i32 - %75 = bitcast i32 %74 to float - %76 = bitcast float %75 to i32 - %77 = icmp ne i32 %76, 0 - br i1 %77, label %IF, label %ENDIF - -IF: ; preds = %main_body - %78 = fsub float -0.000000e+00, %70 - %79 = call float @llvm.AMDIL.exp.(float %78) - %80 = fsub float -0.000000e+00, %79 - %81 = fadd float 1.000000e+00, %80 - %82 = fdiv float 1.000000e+00, %70 - %83 = fmul float %81, %82 - %84 = fmul float %32, %83 - br label %ENDIF - -ENDIF: ; preds = %main_body, %IF - %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] - %85 = call float @fabs(float %71) - %86 = fcmp olt float 0x3EE4F8B580000000, %85 - %87 = sext i1 %86 to i32 - %88 = bitcast i32 %87 to float - %89 = bitcast float %88 to i32 - %90 = icmp ne i32 %89, 0 - br i1 %90, label %IF25, label %ENDIF24 - -IF25: ; preds = %ENDIF - %91 = fsub float -0.000000e+00, %71 - %92 = call float @llvm.AMDIL.exp.(float %91) - %93 = fsub float -0.000000e+00, %92 - %94 = fadd float 1.000000e+00, %93 - %95 = fdiv float 1.000000e+00, %71 - %96 = fmul float %94, %95 - %97 = fmul float %36, %96 - br label %ENDIF24 - -ENDIF24: ; preds = %ENDIF, %IF25 - %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] - %98 = fmul float %29, %temp4.0 - %99 = fmul float %30, %temp4.0 - %100 = fmul float %31, %temp4.0 - %101 = fmul float %33, %temp8.0 - %102 = fadd float %101, %98 - %103 = fmul float %34, %temp8.0 - %104 = fadd float %103, %99 - %105 = fmul float %35, %temp8.0 - %106 = fadd float %105, %100 - %107 = call float @llvm.pow.f32(float %52, float %22) - %108 = fsub float -0.000000e+00, %102 - %109 = fmul float %108, %107 - %110 = fsub float -0.000000e+00, %104 - %111 = fmul float %110, %107 - %112 = fsub float -0.000000e+00, %106 - %113 = fmul float %112, %107 - %114 = call i32 @llvm.SI.packf16(float %109, float %111) - %115 = bitcast i32 %114 to float - %116 = call i32 
@llvm.SI.packf16(float %113, float 1.000000e+00) - %117 = bitcast i32 %116 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117) - ret void -} - -; We just want to make sure the program doesn't crash -; CHECK-LABEL: {{^}}loop: - -define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12) - %26 = fptosi float %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ] - %29 = bitcast float %temp8.0 to i32 - %30 = icmp sge i32 %29, %28 - %31 = sext i1 %30 to i32 - %32 = bitcast i32 %31 to float - %33 = bitcast float %32 to i32 - %34 = icmp ne i32 %33, 0 - br i1 %34, label %IF, label %ENDIF - -IF: ; preds = %LOOP - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) - ret void - -ENDIF: ; preds = %LOOP - %35 = bitcast float %temp8.0 to i32 - %36 = add i32 %35, 1 - %37 = bitcast i32 %36 to float - br label %LOOP -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readonly } -attributes #3 = { readnone } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #3 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -; This checks for a bug in the FixSGPRCopies pass where VReg96 -; registers were being identified as an SGPR regclass which was causing -; an assertion failure.
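; [Editorial note, not part of the original test file: on SI the scalar unit
; cannot read vector registers, so a VGPR-to-SGPR copy has to be rewritten by
; FixSGPRCopies rather than selected directly; misclassifying a 96-bit virtual
; register, which has no same-width SGPR class, as an SGPR regclass is what
; tripped the assertion this test guards against.]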
- -; CHECK-LABEL: {{^}}sample_v3: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: exp -; CHECK: s_endpgm -define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - -entry: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) - %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 - %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 - %28 = fcmp oeq float %23, 0.0 - br i1 %28, label %if, label %else - -if: - %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 - br label %endif - -else: - %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 - br label %endif - -endif: - %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] - %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] - %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) - ret void -} - -!2 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}copy1: -; CHECK: buffer_load_dword -; CHECK: v_add -; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { -entry: - %0 = load float, float addrspace(1)* %in0 - %1 = fcmp oeq float %0, 0.0 - br i1 %1, label %if0, label %endif - -if0: - %2 = bitcast float %0 to i32 - %3 = fcmp olt float %0, 0.0 - br i1 %3, label %if1, label %endif - -if1: - %4 = add i32 %2, 1 - br label %endif - -endif: - %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] - %6 = bitcast i32 %5 to float - store float %6, float addrspace(1)* %out - ret void -} - -; This test just checks that we don't crash or hit an assertion failure.
-; CHECK-LABEL: {{^}}copy2: -; CHECK: s_endpgm - -define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -entry: - br label %LOOP68 - -LOOP68: - %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] - %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] - %g = icmp eq i32 0, %t - %l = bitcast float %temp4.7 to i32 - br i1 %g, label %IF70, label %ENDIF69 - -IF70: - %q = icmp ne i32 %l, 13 - %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - ret void - -ENDIF69: - %u = add i32 %l, %t - %v = bitcast i32 %u to float - %x = add i32 %t, -1 - br label %LOOP68 -} - -attributes #0 = { "ShaderType"="0" } - -; This test checks that image_sample resource descriptors aren't loaded into -; vgprs. The verifier will fail if this happens. -; CHECK-LABEL:{{^}}sample_rsrc: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: s_endpgm -define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { -bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) - %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 - %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 - %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 - %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 - %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp31 = bitcast float %tmp23 to i32 - %tmp36 = icmp ne i32 %tmp31, 0 - br i1 %tmp36, label %bb38, label %bb80 - -bb38: ; preds = %bb - %tmp52 = bitcast float %tmp29 to i32 - %tmp53 = bitcast float %tmp30 to i32 - %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 - %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 - %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) - br label %bb71 - -bb80: ; preds = %bb - %tmp81 = bitcast float %tmp29 to i32 - %tmp82 = bitcast float %tmp30 to i32 - %tmp82.2 = add i32 %tmp82, 1 - %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 - %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 - %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) - br label %bb71 - -bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] - %tmp88 = extractelement <4 x float> %tmp72, i32 0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) - ret void -} - -attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/shared-op-cycle.ll b/llvm/test/CodeGen/R600/shared-op-cycle.ll deleted file mode 100644 index f52a9baf4d1..00000000000 --- a/llvm/test/CodeGen/R600/shared-op-cycle.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main: -; CHECK: MULADD_IEEE * -; CHECK-NOT: MULADD_IEEE * - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { - %w0 = extractelement <4 x float> %reg0, i32 3 - %w1 = extractelement <4 x float> %reg1, i32 3 - %w2 = extractelement <4 x float> %reg2, i32 3 - %sq0 = fmul float %w0, %w0 - %r0 = fadd float %sq0, 2.0 - %sq1 = fmul float %w1, %w1 - %r1 = fadd float %sq1, 2.0 - %sq2 = fmul float %w2, %w2 - %r2 = fadd float %sq2, 2.0 - %v0 = insertelement <4 x float> undef, float %r0, i32 0 - %v1 = insertelement <4 x float> %v0, float %r1, i32 1 - %v2 = insertelement <4 x float> %v1, float %r2, i32 2 - %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) - %vecres = insertelement <4 x float> undef, float %res, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } \ No newline at end of file diff --git a/llvm/test/CodeGen/R600/shl.ll b/llvm/test/CodeGen/R600/shl.ll deleted file mode 100644 index 53b63dc4b8a..00000000000 --- a/llvm/test/CodeGen/R600/shl.ll +++ /dev/null @@ -1,180 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG: {{^}}shl_v2i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v2i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v2i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = shl <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v4i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v4i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = shl <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_i64: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 - -;SI: {{^}}shl_i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = shl i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v2i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v2i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = shl <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v4i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v4i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = shl <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/shl_add_constant.ll b/llvm/test/CodeGen/R600/shl_add_constant.ll deleted file mode 100644 index b1485bfaaeb..00000000000 --- a/llvm/test/CodeGen/R600/shl_add_constant.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; Test with inline immediate - -; FUNC-LABEL: {{^}}shl_2_add_9_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: -; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} -; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} -; SI-DAG: buffer_store_dword [[ADDREG]] -; SI-DAG: buffer_store_dword [[SHLREG]] -; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out0, align 4 - store i32 %add, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test with add literal constant - -; FUNC-LABEL: {{^}}shl_2_add_999_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %shl = add i32 %val, 999 - %result = shl i32 %shl, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; 
FUNC-LABEL: {{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %shl, %y - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] - -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %y, %shl - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/shl_add_ptr.ll b/llvm/test/CodeGen/R600/shl_add_ptr.ll deleted file mode 100644 index 6671e909cd1..00000000000 --- a/llvm/test/CodeGen/R600/shl_add_ptr.ll +++ /dev/null @@ -1,284 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -; Test that doing a shift of a pointer with a constant add will be -; folded into the constant offset addressing mode even if the add has -; multiple uses. This is relevant to accessing 2 separate, adjacent -; LDS globals. - - -declare i32 @llvm.r600.read.tidig.x() #1 - -@lds0 = addrspace(3) global [512 x float] undef, align 4 -@lds1 = addrspace(3) global [512 x float] undef, align 4 - - -; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 - -; SI-LABEL: {{^}}load_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 -; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -; Make sure once the first use is folded into the addressing mode, the -; remaining add use goes through the normal shl + add constant fold. 
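; [Editorial sketch, not part of the original file: the fold exercised here is
; the reassociation  shl (add %x, C), K  ==>  add (shl %x, K), C << K.  With
; C = 2 and K = 2 (4-byte float elements), the +2 element index becomes the +8
; byte offset quoted above, e.g.
;   %idx.0 = add nsw i32 %tid.x, 2
;   ; ...selects to ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8, where [[PTR]]
;   ; holds %tid.x << 2
; so only the remaining non-address use of %idx.0 still needs a real add.]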
- -; SI-LABEL: {{^}}load_shl_base_lds_1: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 -; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} -; SI-DAG: buffer_store_dword [[RESULT]] -; SI-DAG: buffer_store_dword [[ADDUSE]] -; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %shl_add_use = shl i32 %idx.0, 2 - store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -@maxlds = addrspace(3) global [65536 x i8] undef, align 4 - -; SI-LABEL: {{^}}load_shl_base_lds_max_offset -; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 -; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 65535 - %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 - %val0 = load i8, i8 addrspace(3)* %arrayidx0 - store i32 %idx.0, i32 addrspace(1)* %add_use - store i8 %val0, i8 addrspace(1)* %out - ret void -} - -; The two globals are placed adjacent in memory, so the same base -; pointer can be used with an offset into the second one. - -; SI-LABEL: {{^}}load_shl_base_lds_2: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 -; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - store float %sum, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - store float 1.0, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - - -; -------------------------------------------------------------------------------- -; Atomics. 
- -@lds2 = addrspace(3) global [512 x i32] undef, align 4 - -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - - -; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - -; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/si-annotate-cf-assertion.ll b/llvm/test/CodeGen/R600/si-annotate-cf-assertion.ll deleted file mode 100644 index 69d719385ac..00000000000 --- a/llvm/test/CodeGen/R600/si-annotate-cf-assertion.ll +++ /dev/null @@ -1,25 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s - - -define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { -; CHECK-LABEL: {{^}}test: - -entry: - switch i32 %x, label %sw.default [ - i32 0, label %sw.bb - i32 60, label %sw.bb - ] - -sw.bb: - unreachable - -sw.default: - unreachable - -sw.epilog: - ret void -} - diff --git a/llvm/test/CodeGen/R600/si-annotate-cf.ll b/llvm/test/CodeGen/R600/si-annotate-cf.ll deleted file mode 100644 index bbcb861f37d..00000000000 --- a/llvm/test/CodeGen/R600/si-annotate-cf.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: - -; SI: [[LOOP_LABEL:[A-Z0-9]+]]: -; Lowered break instruction: -; SI: s_or_b64 -; Lowered Loop instruction: -; SI: s_andn2_b64 -; s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { -main_body: - %0 = and i32 %a, %b - %1 = trunc i32 %0 to i1 - br label %ENDIF - -ENDLOOP: - store i32 0, i32 addrspace(1)* %out - ret void - -ENDIF: - br i1 %1, label %ENDLOOP, label %ENDIF -} - - -; FUNC-LABEL: {{^}}phi_cond_outside_loop: -; FIXME: This could be folded into the s_or_b64 instruction -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0 -; SI: [[LOOP_LABEL:[A-Z0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} - -; SI_IF_BREAK instruction: -; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]] - -; SI_LOOP instruction: -; SI: s_andn2_b64 exec, exec, [[BREAK]] -; SI: s_cbranch_execnz
[[LOOP_LABEL]] -; SI: s_endpgm - -define void @phi_cond_outside_loop(i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a , 0 - br i1 %0, label %if, label %else - -if: - br label %endif - -else: - %1 = icmp eq i32 %b, 0 - br label %endif - -endif: - %2 = phi i1 [0, %if], [%1, %else] - br label %loop - -loop: - br i1 %2, label %exit, label %loop - -exit: - ret void -} diff --git a/llvm/test/CodeGen/R600/si-lod-bias.ll b/llvm/test/CodeGen/R600/si-lod-bias.ll deleted file mode 100644 index 944499a1146..00000000000 --- a/llvm/test/CodeGen/R600/si-lod-bias.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This shader has the potential to generate illegal VGPR to SGPR copies if -; the wrong register class is used for the REG_SEQUENCE instructions. - -; CHECK: {{^}}main: -; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1 - %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1 - %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %29 = bitcast float %22 to i32 - %30 = bitcast float %27 to i32 - %31 = bitcast float %28 to i32 - %32 = insertelement <4 x i32> undef, i32 %29, i32 0 - %33 = insertelement <4 x i32> %32, i32 %30, i32 1 - %34 = insertelement <4 x i32> %33, i32 %31, i32 2 - %35 = insertelement <4 x i32> %34, i32 undef, i32 3 - %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2) - %37 = extractelement <4 x float> %36, i32 0 - %38 = extractelement <4 x float> %36, i32 1 - %39 = extractelement <4 x float> %36, i32 2 - %40 = extractelement <4 x float> %36, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} diff --git a/llvm/test/CodeGen/R600/si-sgpr-spill.ll b/llvm/test/CodeGen/R600/si-sgpr-spill.ll deleted file mode 100644 index 84652701f77..00000000000 --- a/llvm/test/CodeGen/R600/si-sgpr-spill.ll +++ /dev/null @@ -1,1568 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; These tests check that the compiler won't crash when it
needs to spill -; SGPRs. - -; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm -; Writing to M0 from an SMRD instruction will hang the GPU. -; CHECK-NOT: s_buffer_load_dword m0 -; CHECK: s_endpgm -@ddxy_lds = external addrspace(3) global [64 x i32] - -define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) - %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 - %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 - %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 - %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 - %67 
= getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 - %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 - %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 - %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 - %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 - %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 - %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 - %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 - %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 - %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 - %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 - %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 - %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 - %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 - %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 - %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 - %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 - %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 - %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 - %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 - %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 - %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 - %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 - %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 - %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 - %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 - %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) - %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) - %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) - %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) - %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) - %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) - %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) - %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) - %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) - %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) - %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) - %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) - %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) - %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) - %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) - %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) - %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) - %110 = call i32 @llvm.SI.tid() - %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 - %112 = bitcast float %93 to i32 - store i32 %112, i32 addrspace(3)* %111 - %113 = bitcast float %94 to i32 - store i32 %113, i32 addrspace(3)* %111 - %114 = call i32 @llvm.SI.tid() - %115 = getelementptr [64 x i32], [64 
x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114 - %116 = and i32 %114, -4 - %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 - %118 = add i32 %116, 1 - %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 - %120 = bitcast float %93 to i32 - store i32 %120, i32 addrspace(3)* %115 - %121 = load i32, i32 addrspace(3)* %117 - %122 = bitcast i32 %121 to float - %123 = load i32, i32 addrspace(3)* %119 - %124 = bitcast i32 %123 to float - %125 = fsub float %124, %122 - %126 = bitcast float %94 to i32 - store i32 %126, i32 addrspace(3)* %115 - %127 = load i32, i32 addrspace(3)* %117 - %128 = bitcast i32 %127 to float - %129 = load i32, i32 addrspace(3)* %119 - %130 = bitcast i32 %129 to float - %131 = fsub float %130, %128 - %132 = insertelement <4 x float> undef, float %125, i32 0 - %133 = insertelement <4 x float> %132, float %131, i32 1 - %134 = insertelement <4 x float> %133, float %131, i32 2 - %135 = insertelement <4 x float> %134, float %131, i32 3 - %136 = extractelement <4 x float> %135, i32 0 - %137 = extractelement <4 x float> %135, i32 1 - %138 = fmul float %60, %93 - %139 = fmul float %60, %94 - %140 = fmul float %60, %94 - %141 = fmul float %60, %94 - %142 = call i32 @llvm.SI.tid() - %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 - %144 = bitcast float %138 to i32 - store i32 %144, i32 addrspace(3)* %143 - %145 = bitcast float %139 to i32 - store i32 %145, i32 addrspace(3)* %143 - %146 = bitcast float %140 to i32 - store i32 %146, i32 addrspace(3)* %143 - %147 = bitcast float %141 to i32 - store i32 %147, i32 addrspace(3)* %143 - %148 = call i32 @llvm.SI.tid() - %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 - %150 = and i32 %148, -4 - %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 - %152 = add i32 %150, 2 - %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 - %154 = bitcast float %138 to i32 - store i32 %154, i32 addrspace(3)* %149 - %155 = load i32, i32 addrspace(3)* %151 - %156 = bitcast i32 %155 to float - %157 = load i32, i32 addrspace(3)* %153 - %158 = bitcast i32 %157 to float - %159 = fsub float %158, %156 - %160 = bitcast float %139 to i32 - store i32 %160, i32 addrspace(3)* %149 - %161 = load i32, i32 addrspace(3)* %151 - %162 = bitcast i32 %161 to float - %163 = load i32, i32 addrspace(3)* %153 - %164 = bitcast i32 %163 to float - %165 = fsub float %164, %162 - %166 = bitcast float %140 to i32 - store i32 %166, i32 addrspace(3)* %149 - %167 = load i32, i32 addrspace(3)* %151 - %168 = bitcast i32 %167 to float - %169 = load i32, i32 addrspace(3)* %153 - %170 = bitcast i32 %169 to float - %171 = fsub float %170, %168 - %172 = bitcast float %141 to i32 - store i32 %172, i32 addrspace(3)* %149 - %173 = load i32, i32 addrspace(3)* %151 - %174 = bitcast i32 %173 to float - %175 = load i32, i32 addrspace(3)* %153 - %176 = bitcast i32 %175 to float - %177 = fsub float %176, %174 - %178 = insertelement <4 x float> undef, float %159, i32 0 - %179 = insertelement <4 x float> %178, float %165, i32 1 - %180 = insertelement <4 x float> %179, float %171, i32 2 - %181 = insertelement <4 x float> %180, float %177, i32 3 - %182 = extractelement <4 x float> %181, i32 0 - %183 = extractelement <4 x float> %181, i32 1 - %184 = fdiv float 1.000000e+00, %97 - %185 = fmul float %33, %184 - %186 = fcmp uge float 1.000000e+00, %185 - %187 = select i1 %186, float %185, float 1.000000e+00 
- %188 = fmul float %187, %30 - %189 = call float @ceil(float %188) - %190 = fcmp uge float 3.000000e+00, %189 - %191 = select i1 %190, float 3.000000e+00, float %189 - %192 = fdiv float 1.000000e+00, %191 - %193 = fdiv float 1.000000e+00, %30 - %194 = fmul float %191, %193 - %195 = fmul float %31, %194 - %196 = fmul float %95, %95 - %197 = fmul float %96, %96 - %198 = fadd float %197, %196 - %199 = fmul float %97, %97 - %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq.f32(float %200) - %202 = fmul float %95, %201 - %203 = fmul float %96, %201 - %204 = fmul float %202, %29 - %205 = fmul float %203, %29 - %206 = fmul float %204, -1.000000e+00 - %207 = fmul float %205, 1.000000e+00 - %208 = fmul float %206, %32 - %209 = fmul float %207, %32 - %210 = fsub float -0.000000e+00, %208 - %211 = fadd float %93, %210 - %212 = fsub float -0.000000e+00, %209 - %213 = fadd float %94, %212 - %214 = fmul float %206, %192 - %215 = fmul float %207, %192 - %216 = fmul float -1.000000e+00, %192 - %217 = bitcast float %136 to i32 - %218 = bitcast float %182 to i32 - %219 = bitcast float %137 to i32 - %220 = bitcast float %183 to i32 - %221 = insertelement <8 x i32> undef, i32 %217, i32 0 - %222 = insertelement <8 x i32> %221, i32 %218, i32 1 - %223 = insertelement <8 x i32> %222, i32 %219, i32 2 - %224 = insertelement <8 x i32> %223, i32 %220, i32 3 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] - %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] - %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] - %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] - %225 = fcmp oge float %temp24.0, %191 - %226 = sext i1 %225 to i32 - %227 = bitcast i32 %226 to float - %228 = bitcast float %227 to i32 - %229 = icmp ne i32 %228, 0 - br i1 %229, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %230 = bitcast float %136 to i32 - %231 = bitcast float %182 to i32 - %232 = bitcast float %137 to i32 - %233 = bitcast float %183 to i32 - %234 = insertelement <8 x i32> undef, i32 %230, i32 0 - %235 = insertelement <8 x i32> %234, i32 %231, i32 1 - %236 = insertelement <8 x i32> %235, i32 %232, i32 2 - %237 = insertelement <8 x i32> %236, i32 %233, i32 3 - br label %LOOP65 - -ENDIF: ; preds = %LOOP - %238 = bitcast float %temp28.0 to i32 - %239 = bitcast float %temp29.0 to i32 - %240 = insertelement <8 x i32> %224, i32 %238, i32 4 - %241 = insertelement <8 x i32> %240, i32 %239, i32 5 - %242 = insertelement <8 x i32> %241, i32 undef, i32 6 - %243 = insertelement <8 x i32> %242, i32 undef, i32 7 - %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) - %245 = extractelement <4 x float> %244, i32 3 - %246 = fcmp oge float %temp30.0, %245 - %247 = sext i1 %246 to i32 - %248 = bitcast i32 %247 to float - %249 = bitcast float %248 to i32 - %250 = and i32 %249, 1065353216 - %251 = bitcast i32 %250 to float - %252 = fmul float %214, %251 - %253 = fadd float %252, %temp28.0 - %254 = fmul float %215, %251 - %255 = fadd float %254, %temp29.0 - %256 = fmul float %216, %251 - %257 = fadd float %256, %temp30.0 - %258 = fadd float %temp24.0, 1.000000e+00 - br label %LOOP - -LOOP65: ; preds = %ENDIF66, %IF - %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] - %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] - %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] - %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] - %temp32.0 = 
phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ] - %259 = fcmp oge float %temp24.1, %195 - %260 = sext i1 %259 to i32 - %261 = bitcast i32 %260 to float - %262 = bitcast float %261 to i32 - %263 = icmp ne i32 %262, 0 - br i1 %263, label %IF67, label %ENDIF66 - -IF67: ; preds = %LOOP65 - %264 = bitcast float %136 to i32 - %265 = bitcast float %182 to i32 - %266 = bitcast float %137 to i32 - %267 = bitcast float %183 to i32 - %268 = bitcast float %temp28.1 to i32 - %269 = bitcast float %temp29.1 to i32 - %270 = insertelement <8 x i32> undef, i32 %264, i32 0 - %271 = insertelement <8 x i32> %270, i32 %265, i32 1 - %272 = insertelement <8 x i32> %271, i32 %266, i32 2 - %273 = insertelement <8 x i32> %272, i32 %267, i32 3 - %274 = insertelement <8 x i32> %273, i32 %268, i32 4 - %275 = insertelement <8 x i32> %274, i32 %269, i32 5 - %276 = insertelement <8 x i32> %275, i32 undef, i32 6 - %277 = insertelement <8 x i32> %276, i32 undef, i32 7 - %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) - %279 = extractelement <4 x float> %278, i32 0 - %280 = extractelement <4 x float> %278, i32 1 - %281 = extractelement <4 x float> %278, i32 2 - %282 = extractelement <4 x float> %278, i32 3 - %283 = fmul float %282, %47 - %284 = bitcast float %136 to i32 - %285 = bitcast float %182 to i32 - %286 = bitcast float %137 to i32 - %287 = bitcast float %183 to i32 - %288 = bitcast float %temp28.1 to i32 - %289 = bitcast float %temp29.1 to i32 - %290 = insertelement <8 x i32> undef, i32 %284, i32 0 - %291 = insertelement <8 x i32> %290, i32 %285, i32 1 - %292 = insertelement <8 x i32> %291, i32 %286, i32 2 - %293 = insertelement <8 x i32> %292, i32 %287, i32 3 - %294 = insertelement <8 x i32> %293, i32 %288, i32 4 - %295 = insertelement <8 x i32> %294, i32 %289, i32 5 - %296 = insertelement <8 x i32> %295, i32 undef, i32 6 - %297 = insertelement <8 x i32> %296, i32 undef, i32 7 - %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) - %299 = extractelement <4 x float> %298, i32 0 - %300 = extractelement <4 x float> %298, i32 1 - %301 = extractelement <4 x float> %298, i32 2 - %302 = bitcast float %136 to i32 - %303 = bitcast float %182 to i32 - %304 = bitcast float %137 to i32 - %305 = bitcast float %183 to i32 - %306 = bitcast float %temp28.1 to i32 - %307 = bitcast float %temp29.1 to i32 - %308 = insertelement <8 x i32> undef, i32 %302, i32 0 - %309 = insertelement <8 x i32> %308, i32 %303, i32 1 - %310 = insertelement <8 x i32> %309, i32 %304, i32 2 - %311 = insertelement <8 x i32> %310, i32 %305, i32 3 - %312 = insertelement <8 x i32> %311, i32 %306, i32 4 - %313 = insertelement <8 x i32> %312, i32 %307, i32 5 - %314 = insertelement <8 x i32> %313, i32 undef, i32 6 - %315 = insertelement <8 x i32> %314, i32 undef, i32 7 - %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) - %317 = extractelement <4 x float> %316, i32 0 - %318 = extractelement <4 x float> %316, i32 1 - %319 = extractelement <4 x float> %316, i32 2 - %320 = fmul float %317, %23 - %321 = fmul float %318, %24 - %322 = fmul float %319, %25 - %323 = fmul float %299, %26 - %324 = fadd float %323, %320 - %325 = fmul float %300, %27 - %326 = fadd float %325, %321 - %327 = fmul float %301, %28 - %328 = fadd float %327, %322 - %329 = fadd float %279, %324 - %330 = fadd float %280, %326 - %331 = fadd float %281, %328 - %332 = bitcast float %136 to i32 - %333 = bitcast float %182 to i32 - %334 = bitcast float 
%137 to i32 - %335 = bitcast float %183 to i32 - %336 = bitcast float %temp28.1 to i32 - %337 = bitcast float %temp29.1 to i32 - %338 = insertelement <8 x i32> undef, i32 %332, i32 0 - %339 = insertelement <8 x i32> %338, i32 %333, i32 1 - %340 = insertelement <8 x i32> %339, i32 %334, i32 2 - %341 = insertelement <8 x i32> %340, i32 %335, i32 3 - %342 = insertelement <8 x i32> %341, i32 %336, i32 4 - %343 = insertelement <8 x i32> %342, i32 %337, i32 5 - %344 = insertelement <8 x i32> %343, i32 undef, i32 6 - %345 = insertelement <8 x i32> %344, i32 undef, i32 7 - %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) - %347 = extractelement <4 x float> %346, i32 0 - %348 = extractelement <4 x float> %346, i32 1 - %349 = extractelement <4 x float> %346, i32 2 - %350 = fadd float %347, -5.000000e-01 - %351 = fadd float %348, -5.000000e-01 - %352 = fadd float %349, -5.000000e-01 - %353 = fmul float %350, %350 - %354 = fmul float %351, %351 - %355 = fadd float %354, %353 - %356 = fmul float %352, %352 - %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq.f32(float %357) - %359 = fmul float %350, %358 - %360 = fmul float %351, %358 - %361 = fmul float %352, %358 - %362 = bitcast float %136 to i32 - %363 = bitcast float %182 to i32 - %364 = bitcast float %137 to i32 - %365 = bitcast float %183 to i32 - %366 = bitcast float %temp28.1 to i32 - %367 = bitcast float %temp29.1 to i32 - %368 = insertelement <8 x i32> undef, i32 %362, i32 0 - %369 = insertelement <8 x i32> %368, i32 %363, i32 1 - %370 = insertelement <8 x i32> %369, i32 %364, i32 2 - %371 = insertelement <8 x i32> %370, i32 %365, i32 3 - %372 = insertelement <8 x i32> %371, i32 %366, i32 4 - %373 = insertelement <8 x i32> %372, i32 %367, i32 5 - %374 = insertelement <8 x i32> %373, i32 undef, i32 6 - %375 = insertelement <8 x i32> %374, i32 undef, i32 7 - %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) - %377 = extractelement <4 x float> %376, i32 0 - %378 = extractelement <4 x float> %376, i32 1 - %379 = extractelement <4 x float> %376, i32 2 - %380 = extractelement <4 x float> %376, i32 3 - %381 = fsub float -0.000000e+00, %95 - %382 = fsub float -0.000000e+00, %96 - %383 = fsub float -0.000000e+00, %97 - %384 = fmul float %359, %381 - %385 = fmul float %360, %382 - %386 = fadd float %385, %384 - %387 = fmul float %361, %383 - %388 = fadd float %386, %387 - %389 = fmul float %388, %359 - %390 = fmul float %388, %360 - %391 = fmul float %388, %361 - %392 = fmul float 2.000000e+00, %389 - %393 = fmul float 2.000000e+00, %390 - %394 = fmul float 2.000000e+00, %391 - %395 = fsub float -0.000000e+00, %392 - %396 = fadd float %381, %395 - %397 = fsub float -0.000000e+00, %393 - %398 = fadd float %382, %397 - %399 = fsub float -0.000000e+00, %394 - %400 = fadd float %383, %399 - %401 = fmul float %396, %98 - %402 = fmul float %396, %99 - %403 = fmul float %396, %100 - %404 = fmul float %398, %101 - %405 = fadd float %404, %401 - %406 = fmul float %398, %102 - %407 = fadd float %406, %402 - %408 = fmul float %398, %103 - %409 = fadd float %408, %403 - %410 = fmul float %400, %104 - %411 = fadd float %410, %405 - %412 = fmul float %400, %105 - %413 = fadd float %412, %407 - %414 = fmul float %400, %106 - %415 = fadd float %414, %409 - %416 = bitcast float %136 to i32 - %417 = bitcast float %182 to i32 - %418 = bitcast float %137 to i32 - %419 = bitcast float %183 to i32 - %420 = bitcast float %temp28.1 to i32 - %421 = bitcast float 
%temp29.1 to i32 - %422 = insertelement <8 x i32> undef, i32 %416, i32 0 - %423 = insertelement <8 x i32> %422, i32 %417, i32 1 - %424 = insertelement <8 x i32> %423, i32 %418, i32 2 - %425 = insertelement <8 x i32> %424, i32 %419, i32 3 - %426 = insertelement <8 x i32> %425, i32 %420, i32 4 - %427 = insertelement <8 x i32> %426, i32 %421, i32 5 - %428 = insertelement <8 x i32> %427, i32 undef, i32 6 - %429 = insertelement <8 x i32> %428, i32 undef, i32 7 - %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) - %431 = extractelement <4 x float> %430, i32 0 - %432 = extractelement <4 x float> %430, i32 1 - %433 = extractelement <4 x float> %430, i32 2 - %434 = fmul float %48, %411 - %435 = fmul float %49, %411 - %436 = fmul float %50, %411 - %437 = fmul float %51, %413 - %438 = fadd float %437, %434 - %439 = fmul float %52, %413 - %440 = fadd float %439, %435 - %441 = fmul float %53, %413 - %442 = fadd float %441, %436 - %443 = fmul float %54, %415 - %444 = fadd float %443, %438 - %445 = fmul float %55, %415 - %446 = fadd float %445, %440 - %447 = fmul float %56, %415 - %448 = fadd float %447, %442 - %449 = insertelement <4 x float> undef, float %444, i32 0 - %450 = insertelement <4 x float> %449, float %446, i32 1 - %451 = insertelement <4 x float> %450, float %448, i32 2 - %452 = insertelement <4 x float> %451, float %195, i32 3 - %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) - %454 = extractelement <4 x float> %453, i32 0 - %455 = extractelement <4 x float> %453, i32 1 - %456 = extractelement <4 x float> %453, i32 2 - %457 = extractelement <4 x float> %453, i32 3 - %458 = call float @fabs(float %456) - %459 = fdiv float 1.000000e+00, %458 - %460 = fmul float %454, %459 - %461 = fadd float %460, 1.500000e+00 - %462 = fmul float %455, %459 - %463 = fadd float %462, 1.500000e+00 - %464 = bitcast float %463 to i32 - %465 = bitcast float %461 to i32 - %466 = bitcast float %457 to i32 - %467 = insertelement <4 x i32> undef, i32 %464, i32 0 - %468 = insertelement <4 x i32> %467, i32 %465, i32 1 - %469 = insertelement <4 x i32> %468, i32 %466, i32 2 - %470 = insertelement <4 x i32> %469, i32 undef, i32 3 - %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) - %472 = extractelement <4 x float> %471, i32 0 - %473 = extractelement <4 x float> %471, i32 1 - %474 = extractelement <4 x float> %471, i32 2 - %475 = fmul float %431, %472 - %476 = fadd float %475, %329 - %477 = fmul float %432, %473 - %478 = fadd float %477, %330 - %479 = fmul float %433, %474 - %480 = fadd float %479, %331 - %481 = fmul float %107, %107 - %482 = fmul float %108, %108 - %483 = fadd float %482, %481 - %484 = fmul float %109, %109 - %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq.f32(float %485) - %487 = fmul float %107, %486 - %488 = fmul float %108, %486 - %489 = fmul float %109, %486 - %490 = fmul float %377, %40 - %491 = fmul float %378, %41 - %492 = fmul float %379, %42 - %493 = fmul float %359, %487 - %494 = fmul float %360, %488 - %495 = fadd float %494, %493 - %496 = fmul float %361, %489 - %497 = fadd float %495, %496 - %498 = fmul float %497, %359 - %499 = fmul float %497, %360 - %500 = fmul float %497, %361 - %501 = fmul float 2.000000e+00, %498 - %502 = fmul float 2.000000e+00, %499 - %503 = fmul float 2.000000e+00, %500 - %504 = fsub float -0.000000e+00, %501 - %505 = fadd float %487, %504 - %506 = fsub float -0.000000e+00, %502 - %507 = fadd float %488, %506 - %508 = fsub float 
-0.000000e+00, %503 - %509 = fadd float %489, %508 - %510 = fmul float %95, %95 - %511 = fmul float %96, %96 - %512 = fadd float %511, %510 - %513 = fmul float %97, %97 - %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq.f32(float %514) - %516 = fmul float %95, %515 - %517 = fmul float %96, %515 - %518 = fmul float %97, %515 - %519 = fmul float %505, %516 - %520 = fmul float %507, %517 - %521 = fadd float %520, %519 - %522 = fmul float %509, %518 - %523 = fadd float %521, %522 - %524 = fsub float -0.000000e+00, %523 - %525 = fcmp uge float %524, 0.000000e+00 - %526 = select i1 %525, float %524, float 0.000000e+00 - %527 = fmul float %43, %380 - %528 = fadd float %527, 1.000000e+00 - %529 = call float @llvm.pow.f32(float %526, float %528) - %530 = fmul float %476, %37 - %531 = fmul float %478, %38 - %532 = fmul float %480, %39 - %533 = fmul float %359, %487 - %534 = fmul float %360, %488 - %535 = fadd float %534, %533 - %536 = fmul float %361, %489 - %537 = fadd float %535, %536 - %538 = fcmp uge float %537, 0.000000e+00 - %539 = select i1 %538, float %537, float 0.000000e+00 - %540 = fmul float %530, %539 - %541 = fmul float %531, %539 - %542 = fmul float %532, %539 - %543 = fmul float %490, %529 - %544 = fadd float %543, %540 - %545 = fmul float %491, %529 - %546 = fadd float %545, %541 - %547 = fmul float %492, %529 - %548 = fadd float %547, %542 - %549 = fmul float %476, %34 - %550 = fmul float %478, %35 - %551 = fmul float %480, %36 - %552 = fmul float %544, %57 - %553 = fadd float %552, %549 - %554 = fmul float %546, %58 - %555 = fadd float %554, %550 - %556 = fmul float %548, %59 - %557 = fadd float %556, %551 - %558 = bitcast float %136 to i32 - %559 = bitcast float %182 to i32 - %560 = bitcast float %137 to i32 - %561 = bitcast float %183 to i32 - %562 = bitcast float %temp28.1 to i32 - %563 = bitcast float %temp29.1 to i32 - %564 = insertelement <8 x i32> undef, i32 %558, i32 0 - %565 = insertelement <8 x i32> %564, i32 %559, i32 1 - %566 = insertelement <8 x i32> %565, i32 %560, i32 2 - %567 = insertelement <8 x i32> %566, i32 %561, i32 3 - %568 = insertelement <8 x i32> %567, i32 %562, i32 4 - %569 = insertelement <8 x i32> %568, i32 %563, i32 5 - %570 = insertelement <8 x i32> %569, i32 undef, i32 6 - %571 = insertelement <8 x i32> %570, i32 undef, i32 7 - %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) - %573 = extractelement <4 x float> %572, i32 0 - %574 = extractelement <4 x float> %572, i32 1 - %575 = extractelement <4 x float> %572, i32 2 - %576 = fmul float %573, %44 - %577 = fadd float %576, %553 - %578 = fmul float %574, %45 - %579 = fadd float %578, %555 - %580 = fmul float %575, %46 - %581 = fadd float %580, %557 - %582 = call i32 @llvm.SI.packf16(float %577, float %579) - %583 = bitcast i32 %582 to float - %584 = call i32 @llvm.SI.packf16(float %581, float %283) - %585 = bitcast i32 %584 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) - ret void - -ENDIF66: ; preds = %LOOP65 - %586 = bitcast float %temp28.1 to i32 - %587 = bitcast float %temp29.1 to i32 - %588 = insertelement <8 x i32> %237, i32 %586, i32 4 - %589 = insertelement <8 x i32> %588, i32 %587, i32 5 - %590 = insertelement <8 x i32> %589, i32 undef, i32 6 - %591 = insertelement <8 x i32> %590, i32 undef, i32 7 - %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) - %593 = extractelement <4 x float> %592, i32 3 - %594 = 
fcmp oge float %temp30.1, %593 - %595 = sext i1 %594 to i32 - %596 = bitcast i32 %595 to float - %597 = bitcast float %596 to i32 - %598 = and i32 %597, 1065353216 - %599 = bitcast i32 %598 to float - %600 = fmul float 5.000000e-01, %temp32.0 - %601 = fsub float -0.000000e+00, %600 - %602 = fmul float %599, %temp32.0 - %603 = fadd float %602, %601 - %604 = fmul float %214, %603 - %605 = fadd float %604, %temp28.1 - %606 = fmul float %215, %603 - %607 = fadd float %606, %temp29.1 - %608 = fmul float %216, %603 - %609 = fadd float %608, %temp30.1 - %610 = fadd float %temp24.1, 1.000000e+00 - %611 = fmul float %temp32.0, 5.000000e-01 - br label %LOOP65 -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: readnone -declare i32 @llvm.SI.tid() #2 - -; Function Attrs: readonly -declare float @ceil(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 - -; Function Attrs: readnone -declare float @fabs(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } -attributes #3 = { readonly } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}main1: -; CHECK: s_endpgm -define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %41 = 
call float @llvm.SI.load.const(<16 x i8> %22, i32 164) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) - %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) - %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) - %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) - %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) - %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) - %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) - %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) - %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) - %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) - %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) - %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) - %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) - %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) - %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) - %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) - %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) - %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) - %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) - %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) - %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) - %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) - %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) - %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) - %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) - %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) - %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) - %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) - %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) - %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) - %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) - %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) - %97 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 608)
-  %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612)
-  %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616)
-  %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624)
-  %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628)
-  %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632)
-  %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636)
-  %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640)
-  %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644)
-  %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648)
-  %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652)
-  %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656)
-  %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660)
-  %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664)
-  %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668)
-  %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672)
-  %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676)
-  %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680)
-  %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684)
-  %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688)
-  %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692)
-  %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696)
-  %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700)
-  %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704)
-  %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708)
-  %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712)
-  %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716)
-  %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864)
-  %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868)
-  %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
-  %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0
-  %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
-  %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0
-  %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
-  %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0
-  %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
-  %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0
-  %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
-  %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0
-  %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
-  %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0
-  %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
-  %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0
-  %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
-  %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0
-  %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
-  %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0
-  %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
-  %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0
-  %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
-  %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0
-  %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
-  %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0
-  %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
-  %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0
-  %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
-  %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0
-  %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
-  %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0
-  %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
-  %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0
-  %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8
-  %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0
-  %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8
-  %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0
-  %162 = fcmp ugt float %17, 0.000000e+00
-  %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00
-  %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
-  %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
-  %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6)
-  %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6)
-  %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
-  %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
-  %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
-  %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6)
-  %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
-  %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
-  %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
-  %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6)
-  %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
-  %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
-  %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
-  %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6)
-  %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
-  %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
-  %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
-  %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6)
-  %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
-  %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
-  %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
-  %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6)
-  %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6)
-  %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6)
-  %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6)
-  %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6)
-  %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6)
-  %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6)
-  %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6)
-  %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6)
-  %196 = fmul float %14, %124
-  %197 = fadd float %196, %125
-  %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00)
-  %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %202 = bitcast float %198 to i32
-  %203 = icmp ne i32 %202, 0
-  %. = select i1 %203, float -1.000000e+00, float 1.000000e+00
-  %204 = fsub float -0.000000e+00, %164
-  %205 = fadd float %44, %204
-  %206 = fsub float -0.000000e+00, %165
-  %207 = fadd float %45, %206
-  %208 = fsub float -0.000000e+00, %166
-  %209 = fadd float %46, %208
-  %210 = fmul float %205, %205
-  %211 = fmul float %207, %207
-  %212 = fadd float %211, %210
-  %213 = fmul float %209, %209
-  %214 = fadd float %212, %213
-  %215 = call float @llvm.AMDGPU.rsq.f32(float %214)
-  %216 = fmul float %205, %215
-  %217 = fmul float %207, %215
-  %218 = fmul float %209, %215
-  %219 = fmul float %., %54
-  %220 = fmul float %13, %47
-  %221 = fmul float %197, %48
-  %222 = bitcast float %174 to i32
-  %223 = bitcast float %175 to i32
-  %224 = insertelement <2 x i32> undef, i32 %222, i32 0
-  %225 = insertelement <2 x i32> %224, i32 %223, i32 1
-  %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2)
-  %227 = extractelement <4 x float> %226, i32 0
-  %228 = extractelement <4 x float> %226, i32 1
-  %229 = extractelement <4 x float> %226, i32 2
-  %230 = extractelement <4 x float> %226, i32 3
-  %231 = fmul float %227, 0x4012611180000000
-  %232 = fmul float %228, 0x4012611180000000
-  %233 = fmul float %229, 0x4012611180000000
-  %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00)
-  %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00)
-  %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00)
-  %237 = fmul float %216, %184
-  %238 = fmul float %217, %185
-  %239 = fadd float %238, %237
-  %240 = fmul float %218, %186
-  %241 = fadd float %239, %240
-  %242 = fmul float %216, %187
-  %243 = fmul float %217, %188
-  %244 = fadd float %243, %242
-  %245 = fmul float %218, %189
-  %246 = fadd float %244, %245
-  %247 = fmul float %216, %190
-  %248 = fmul float %217, %191
-  %249 = fadd float %248, %247
-  %250 = fmul float %218, %192
-  %251 = fadd float %249, %250
-  %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00)
-  %253 = fmul float %214, 0x3F5A36E2E0000000
-  %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00)
-  %255 = fsub float -0.000000e+00, %254
-  %256 = fadd float 1.000000e+00, %255
-  %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01)
-  %258 = fmul float %39, %257
-  %259 = fmul float %241, %258
-  %260 = fmul float %246, %258
-  %261 = fmul float %259, %230
-  %262 = fmul float %260, %230
-  %263 = fadd float %252, 0x3EE4F8B580000000
-  %264 = fsub float -0.000000e+00, %252
-  %265 = fadd float 1.000000e+00, %264
-  %266 = fmul float 1.200000e+01, %265
-  %267 = fadd float %266, 4.000000e+00
-  %268 = fsub float -0.000000e+00, %267
-  %269 = fmul float %268, %263
-  %270 = fsub float -0.000000e+00, %267
-  %271 = fmul float %270, %263
-  %272 = fsub float -0.000000e+00, %267
-  %273 = fmul float %272, %263
-  %274 = fdiv float 1.000000e+00, %269
-  %275 = fdiv float 1.000000e+00, %271
-  %276 = fdiv float 1.000000e+00, %273
-  %277 = fmul float %261, %274
-  %278 = fmul float %262, %275
-  %279 = fmul float %263, %276
-  br label %LOOP
-
-LOOP: ; preds = %LOOP, %main_body
-  %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ]
-  %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ]
-  %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ]
-  %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ]
-  %280 = bitcast float %temp168.0 to i32
-  %281 = bitcast float %temp169.0 to i32
-  %282 = insertelement <4 x i32> undef, i32 %280, i32 0
-  %283 = insertelement <4 x i32> %282, i32 %281, i32 1
-  %284 = insertelement <4 x i32> %283, i32 0, i32 2
-  %285 = insertelement <4 x i32> %284, i32 undef, i32 3
-  %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2)
-  %287 = extractelement <4 x float> %286, i32 3
-  %288 = fadd float %temp168.0, %277
-  %289 = fadd float %temp169.0, %278
-  %290 = fadd float %temp170.0, %279
-  %291 = fsub float -0.000000e+00, %287
-  %292 = fadd float %290, %291
-  %293 = fcmp oge float 0.000000e+00, %292
-  %294 = sext i1 %293 to i32
-  %295 = bitcast i32 %294 to float
-  %296 = bitcast float %295 to i32
-  %297 = icmp ne i32 %296, 0
-  br i1 %297, label %IF189, label %LOOP
-
-IF189: ; preds = %LOOP
-  %298 = extractelement <4 x float> %286, i32 0
-  %299 = extractelement <4 x float> %286, i32 1
-  %300 = extractelement <4 x float> %286, i32 2
-  %301 = fsub float -0.000000e+00, %292
-  %302 = fadd float %temp144.0, %301
-  %303 = fdiv float 1.000000e+00, %302
-  %304 = fmul float %292, %303
-  %305 = fadd float %304, -1.000000e+00
-  %306 = fmul float %305, %277
-  %307 = fadd float %306, %288
-  %308 = fmul float %305, %278
-  %309 = fadd float %308, %289
-  %310 = fsub float -0.000000e+00, %176
-  %311 = fadd float %307, %310
-  %312 = fsub float -0.000000e+00, %177
-  %313 = fadd float %309, %312
-  %314 = fadd float %176, %311
-  %315 = fadd float %177, %313
-  %316 = fmul float %311, %67
-  %317 = fmul float %313, %68
-  %318 = fmul float %316, %55
-  %319 = fmul float %316, %56
-  %320 = fmul float %317, %57
-  %321 = fadd float %320, %318
-  %322 = fmul float %317, %58
-  %323 = fadd float %322, %319
-  %324 = fadd float %178, %321
-  %325 = fadd float %179, %323
-  %326 = fmul float %316, %59
-  %327 = fmul float %316, %60
-  %328 = fmul float %316, %61
-  %329 = fmul float %316, %62
-  %330 = fmul float %317, %63
-  %331 = fadd float %330, %326
-  %332 = fmul float %317, %64
-  %333 = fadd float %332, %327
-  %334 = fmul float %317, %65
-  %335 = fadd float %334, %328
-  %336 = fmul float %317, %66
-  %337 = fadd float %336, %329
-  %338 = fadd float %168, %331
-  %339 = fadd float %169, %333
-  %340 = fadd float %170, %335
-  %341 = fadd float %171, %337
-  %342 = bitcast float %338 to i32
-  %343 = bitcast float %339 to i32
-  %344 = insertelement <2 x i32> undef, i32 %342, i32 0
-  %345 = insertelement <2 x i32> %344, i32 %343, i32 1
-  %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2)
-  %347 = extractelement <4 x float> %346, i32 0
-  %348 = extractelement <4 x float> %346, i32 1
-  %349 = extractelement <4 x float> %346, i32 2
-  %350 = extractelement <4 x float> %346, i32 3
-  %351 = fmul float %347, %23
-  %352 = fmul float %348, %24
-  %353 = fmul float %349, %25
-  %354 = fmul float %350, %26
-  %355 = fmul float %351, %180
-  %356 = fmul float %352, %181
-  %357 = fmul float %353, %182
-  %358 = fmul float %354, %183
-  %359 = fsub float -0.000000e+00, %350
-  %360 = fadd float 1.000000e+00, %359
-  %361 = fmul float %360, %49
-  %362 = call float @llvm.AMDGPU.lrp(float %361, float %347, float %355)
-  %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356)
-  %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357)
-  %365 = bitcast float %340 to i32
-  %366 = bitcast float %341 to i32
-  %367 = insertelement <2 x i32> undef, i32 %365, i32 0
-  %368 = insertelement <2 x i32> %367, i32 %366, i32 1
-  %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2)
-  %370 = extractelement <4 x float> %369, i32 2
-  %371 = fmul float %362, %234
-  %372 = fmul float %363, %235
-  %373 = fmul float %364, %236
-  %374 = fmul float %358, %230
-  %375 = bitcast float %314 to i32
-  %376 = bitcast float %315 to i32
-  %377 = insertelement <2 x i32> undef, i32 %375, i32 0
-  %378 = insertelement <2 x i32> %377, i32 %376, i32 1
-  %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2)
-  %380 = extractelement <4 x float> %379, i32 0
-  %381 = extractelement <4 x float> %379, i32 1
-  %382 = extractelement <4 x float> %379, i32 2
-  %383 = extractelement <4 x float> %379, i32 3
-  %384 = fcmp olt float 0.000000e+00, %382
-  %385 = sext i1 %384 to i32
-  %386 = bitcast i32 %385 to float
-  %387 = bitcast float %386 to i32
-  %388 = icmp ne i32 %387, 0
-  %.224 = select i1 %388, float %381, float %380
-  %.225 = select i1 %388, float %383, float %381
-  %389 = bitcast float %324 to i32
-  %390 = bitcast float %325 to i32
-  %391 = insertelement <2 x i32> undef, i32 %389, i32 0
-  %392 = insertelement <2 x i32> %391, i32 %390, i32 1
-  %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2)
-  %394 = extractelement <4 x float> %393, i32 0
-  %395 = extractelement <4 x float> %393, i32 1
-  %396 = extractelement <4 x float> %393, i32 2
-  %397 = extractelement <4 x float> %393, i32 3
-  %398 = fcmp olt float 0.000000e+00, %396
-  %399 = sext i1 %398 to i32
-  %400 = bitcast i32 %399 to float
-  %401 = bitcast float %400 to i32
-  %402 = icmp ne i32 %401, 0
-  %temp112.1 = select i1 %402, float %395, float %394
-  %temp113.1 = select i1 %402, float %397, float %395
-  %403 = fmul float %.224, 2.000000e+00
-  %404 = fadd float %403, -1.000000e+00
-  %405 = fmul float %.225, 2.000000e+00
-  %406 = fadd float %405, -1.000000e+00
-  %407 = fmul float %temp112.1, 2.000000e+00
-  %408 = fadd float %407, -1.000000e+00
-  %409 = fmul float %temp113.1, 2.000000e+00
-  %410 = fadd float %409, -1.000000e+00
-  %411 = fsub float -0.000000e+00, %404
-  %412 = fmul float %411, %35
-  %413 = fsub float -0.000000e+00, %406
-  %414 = fmul float %413, %35
-  %415 = fsub float -0.000000e+00, %408
-  %416 = fmul float %415, %36
-  %417 = fsub float -0.000000e+00, %410
-  %418 = fmul float %417, %36
-  %419 = fmul float %416, %370
-  %420 = fmul float %418, %370
-  %421 = call float @fabs(float %412)
-  %422 = call float @fabs(float %414)
-  %423 = fsub float -0.000000e+00, %421
-  %424 = fadd float 1.000000e+00, %423
-  %425 = fsub float -0.000000e+00, %422
-  %426 = fadd float 1.000000e+00, %425
-  %427 = fmul float %424, %419
-  %428 = fadd float %427, %412
-  %429 = fmul float %426, %420
-  %430 = fadd float %429, %414
-  %431 = fmul float %428, %428
-  %432 = fmul float %430, %430
-  %433 = fadd float %431, %432
-  %434 = fsub float -0.000000e+00, %433
-  %435 = fadd float 0x3FF00068E0000000, %434
-  %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
-  %437 = call float @llvm.AMDGPU.rsq.f32(float %436)
-  %438 = fmul float %437, %436
-  %439 = fsub float -0.000000e+00, %436
-  %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
-  %441 = fmul float %184, %428
-  %442 = fmul float %185, %428
-  %443 = fmul float %186, %428
-  %444 = fmul float %187, %430
-  %445 = fadd float %444, %441
-  %446 = fmul float %188, %430
-  %447 = fadd float %446, %442
-  %448 = fmul float %189, %430
-  %449 = fadd float %448, %443
-  %450 = fmul float %190, %440
-  %451 = fadd float %450, %445
-  %452 = fmul float %191, %440
-  %453 = fadd float %452, %447
-  %454 = fmul float %192, %440
-  %455 = fadd float %454, %449
-  %456 = fmul float %451, %451
-  %457 = fmul float %453, %453
-  %458 = fadd float %457, %456
-  %459 = fmul float %455, %455
-  %460 = fadd float %458, %459
-  %461 = call float @llvm.AMDGPU.rsq.f32(float %460)
-  %462 = fmul float %451, %461
-  %463 = fmul float %453, %461
-  %464 = fmul float %455, %461
-  %465 = fcmp olt float 0.000000e+00, %219
-  %466 = sext i1 %465 to i32
-  %467 = bitcast i32 %466 to float
-  %468 = bitcast float %467 to i32
-  %469 = icmp ne i32 %468, 0
-  br i1 %469, label %IF198, label %ENDIF197
-
-IF198: ; preds = %IF189
-  %470 = fsub float -0.000000e+00, %462
-  %471 = fsub float -0.000000e+00, %463
-  %472 = fsub float -0.000000e+00, %464
-  br label %ENDIF197
-
-ENDIF197: ; preds = %IF189, %IF198
-  %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ]
-  %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ]
-  %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ]
-  %473 = bitcast float %220 to i32
-  %474 = bitcast float %221 to i32
-  %475 = insertelement <2 x i32> undef, i32 %473, i32 0
-  %476 = insertelement <2 x i32> %475, i32 %474, i32 1
-  %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2)
-  %478 = extractelement <4 x float> %477, i32 0
-  %479 = extractelement <4 x float> %477, i32 1
-  %480 = extractelement <4 x float> %477, i32 2
-  %481 = extractelement <4 x float> %477, i32 3
-  %482 = fmul float %478, %40
-  %483 = fadd float %482, %41
-  %484 = fmul float %479, %40
-  %485 = fadd float %484, %41
-  %486 = fmul float %480, %40
-  %487 = fadd float %486, %41
-  %488 = fmul float %481, %42
-  %489 = fadd float %488, %43
-  %490 = bitcast float %172 to i32
-  %491 = bitcast float %173 to i32
-  %492 = insertelement <2 x i32> undef, i32 %490, i32 0
-  %493 = insertelement <2 x i32> %492, i32 %491, i32 1
-  %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2)
-  %495 = extractelement <4 x float> %494, i32 0
-  %496 = extractelement <4 x float> %494, i32 1
-  %497 = extractelement <4 x float> %494, i32 2
-  %498 = extractelement <4 x float> %494, i32 3
-  %499 = fmul float %498, 3.200000e+01
-  %500 = fadd float %499, -1.600000e+01
-  %501 = call float @llvm.AMDIL.exp.(float %500)
-  %502 = fmul float %495, %501
-  %503 = fmul float %496, %501
-  %504 = fmul float %497, %501
-  %505 = fmul float %28, %502
-  %506 = fadd float %505, %193
-  %507 = fmul float %29, %503
-  %508 = fadd float %507, %194
-  %509 = fmul float %30, %504
-  %510 = fadd float %509, %195
-  %511 = fmul float %506, %489
-  %512 = fmul float %508, %489
-  %513 = fmul float %510, %489
-  %514 = fmul float %489, 5.000000e-01
-  %515 = fadd float %514, 5.000000e-01
-  %516 = fmul float %483, %515
-  %517 = fadd float %516, %511
-  %518 = fmul float %485, %515
-  %519 = fadd float %518, %512
-  %520 = fmul float %487, %515
-  %521 = fadd float %520, %513
-  %522 = fmul float %517, %371
-  %523 = fmul float %519, %372
-  %524 = fmul float %521, %373
-  %525 = fmul float %428, 0x3FDB272440000000
-  %526 = fmul float %430, 0xBFDB272440000000
-  %527 = fadd float %526, %525
-  %528 = fmul float %440, 0x3FE99999A0000000
-  %529 = fadd float %527, %528
-  %530 = fmul float %529, 5.000000e-01
-  %531 = fadd float %530, 0x3FE3333340000000
-  %532 = fmul float %531, %531
-  %533 = fmul float %522, %532
-  %534 = fmul float %523, %532
-  %535 = fmul float %524, %532
-  %536 = fsub float -0.000000e+00, %72
-  %537 = fsub float -0.000000e+00, %73
-  %538 = fsub float -0.000000e+00, %74
-  %539 = fmul float %temp12.0, %536
-  %540 = fmul float %temp13.0, %537
-  %541 = fadd float %540, %539
-  %542 = fmul float %temp14.0, %538
-  %543 = fadd float %541, %542
-  %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00)
-  %545 = fmul float %371, %544
-  %546 = fmul float %372, %544
-  %547 = fmul float %373, %544
-  %548 = fmul float %545, %69
-  %549 = fmul float %546, %70
-  %550 = fmul float %547, %71
-  %551 = fsub float -0.000000e+00, %164
-  %552 = fadd float %97, %551
-  %553 = fsub float -0.000000e+00, %165
-  %554 = fadd float %98, %553
-  %555 = fsub float -0.000000e+00, %166
-  %556 = fadd float %99, %555
-  %557 = fmul float %552, %552
-  %558 = fmul float %554, %554
-  %559 = fadd float %558, %557
-  %560 = fmul float %556, %556
-  %561 = fadd float %559, %560
-  %562 = call float @llvm.AMDGPU.rsq.f32(float %561)
-  %563 = fmul float %562, %561
-  %564 = fsub float -0.000000e+00, %561
-  %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
-  %566 = fsub float -0.000000e+00, %84
-  %567 = fadd float %565, %566
-  %568 = fsub float -0.000000e+00, %83
-  %569 = fadd float %565, %568
-  %570 = fsub float -0.000000e+00, %82
-  %571 = fadd float %565, %570
-  %572 = fsub float -0.000000e+00, %84
-  %573 = fadd float %83, %572
-  %574 = fsub float -0.000000e+00, %83
-  %575 = fadd float %82, %574
-  %576 = fsub float -0.000000e+00, %82
-  %577 = fadd float %81, %576
-  %578 = fdiv float 1.000000e+00, %573
-  %579 = fdiv float 1.000000e+00, %575
-  %580 = fdiv float 1.000000e+00, %577
-  %581 = fmul float %567, %578
-  %582 = fmul float %569, %579
-  %583 = fmul float %571, %580
-  %584 = fcmp olt float %565, %83
-  %585 = sext i1 %584 to i32
-  %586 = bitcast i32 %585 to float
-  %587 = bitcast float %586 to i32
-  %588 = icmp ne i32 %587, 0
-  br i1 %588, label %ENDIF200, label %ELSE202
-
-ELSE202: ; preds = %ENDIF197
-  %589 = fcmp olt float %565, %82
-  %590 = sext i1 %589 to i32
-  %591 = bitcast i32 %590 to float
-  %592 = bitcast float %591 to i32
-  %593 = icmp ne i32 %592, 0
-  br i1 %593, label %ENDIF200, label %ELSE205
-
-ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197
-  %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ]
-  %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ]
-  %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ]
-  %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ]
-  %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ]
-  %594 = fcmp olt float %565, %83
-  %595 = sext i1 %594 to i32
-  %596 = bitcast i32 %595 to float
-  %597 = bitcast float %596 to i32
-  %598 = icmp ne i32 %597, 0
-  br i1 %598, label %ENDIF209, label %ELSE211
-
-ELSE205: ; preds = %ELSE202
-  %599 = fcmp olt float %565, %81
-  %600 = sext i1 %599 to i32
-  %601 = bitcast i32 %600 to float
-  %602 = bitcast float %601 to i32
-  %603 = icmp ne i32 %602, 0
-  %.226 = select i1 %603, float %583, float 1.000000e+00
-  %.227 = select i1 %603, float %118, float %116
-  %.228 = select i1 %603, float %119, float %117
-  br label %ENDIF200
-
-ELSE211: ; preds = %ENDIF200
-  %604 = fcmp olt float %565, %82
-  %605 = sext i1 %604 to i32
-  %606 = bitcast i32 %605 to float
-  %607 = bitcast float %606 to i32
-  %608 = icmp ne i32 %607, 0
-  br i1 %608, label %ENDIF209, label %ELSE214
-
-ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200
-  %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ]
-  %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ]
-  %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ]
-  %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ]
-  %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ]
-  %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ]
-  %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ]
-  %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ]
-  %609 = fmul float %164, %85
-  %610 = fmul float %165, %86
-  %611 = fadd float %609, %610
-  %612 = fmul float %166, %87
-  %613 = fadd float %611, %612
-  %614 = fmul float %167, %88
-  %615 = fadd float %613, %614
-  %616 = fmul float %164, %89
-  %617 = fmul float %165, %90
-  %618 = fadd float %616, %617
-  %619 = fmul float %166, %91
-  %620 = fadd float %618, %619
-  %621 = fmul float %167, %92
-  %622 = fadd float %620, %621
-  %623 = fmul float %164, %93
-  %624 = fmul float %165, %94
-  %625 = fadd float %623, %624
-  %626 = fmul float %166, %95
-  %627 = fadd float %625, %626
-  %628 = fmul float %167, %96
-  %629 = fadd float %627, %628
-  %630 = fsub float -0.000000e+00, %78
-  %631 = fadd float 1.000000e+00, %630
-  %632 = call float @fabs(float %615)
-  %633 = call float @fabs(float %622)
-  %634 = fcmp oge float %631, %632
-  %635 = sext i1 %634 to i32
-  %636 = bitcast i32 %635 to float
-  %637 = bitcast float %636 to i32
-  %638 = and i32 %637, 1065353216
-  %639 = bitcast i32 %638 to float
-  %640 = fcmp oge float %631, %633
-  %641 = sext i1 %640 to i32
-  %642 = bitcast i32 %641 to float
-  %643 = bitcast float %642 to i32
-  %644 = and i32 %643, 1065353216
-  %645 = bitcast i32 %644 to float
-  %646 = fmul float %639, %645
-  %647 = fmul float %629, %646
-  %648 = fmul float %615, %temp68.0
-  %649 = fadd float %648, %temp70.0
-  %650 = fmul float %622, %temp69.0
-  %651 = fadd float %650, %temp71.0
-  %652 = fmul float %615, %temp52.0
-  %653 = fadd float %652, %temp54.0
-  %654 = fmul float %622, %temp53.0
-  %655 = fadd float %654, %temp55.0
-  %656 = fadd float %temp80.0, -1.000000e+00
-  %657 = fmul float %656, %77
-  %658 = fadd float %657, 1.000000e+00
-  %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00)
-  %660 = bitcast float %649 to i32
-  %661 = bitcast float %651 to i32
-  %662 = bitcast float 0.000000e+00 to i32
-  %663 = insertelement <4 x i32> undef, i32 %660, i32 0
-  %664 = insertelement <4 x i32> %663, i32 %661, i32 1
-  %665 = insertelement <4 x i32> %664, i32 %662, i32 2
-  %666 = insertelement <4 x i32> %665, i32 undef, i32 3
-  %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2)
-  %668 = extractelement <4 x float> %667, i32 0
-  %669 = extractelement <4 x float> %667, i32 1
-  %670 = bitcast float %653 to i32
-  %671 = bitcast float %655 to i32
-  %672 = bitcast float 0.000000e+00 to i32
-  %673 = insertelement <4 x i32> undef, i32 %670, i32 0
-  %674 = insertelement <4 x i32> %673, i32 %671, i32 1
-  %675 = insertelement <4 x i32> %674, i32 %672, i32 2
-  %676 = insertelement <4 x i32> %675, i32 undef, i32 3
-  %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2)
-  %678 = extractelement <4 x float> %677, i32 0
-  %679 = extractelement <4 x float> %677, i32 1
-  %680 = fsub float -0.000000e+00, %669
-  %681 = fadd float 1.000000e+00, %680
-  %682 = fsub float -0.000000e+00, %679
-  %683 = fadd float 1.000000e+00, %682
-  %684 = fmul float %681, 2.500000e-01
-  %685 = fmul float %683, 2.500000e-01
-  %686 = fsub float -0.000000e+00, %684
-  %687 = fadd float %668, %686
-  %688 = fsub float -0.000000e+00, %685
-  %689 = fadd float %678, %688
-  %690 = fmul float %647, %temp88.0
-  %691 = fadd float %690, %temp89.0
-  %692 = fmul float %647, %temp90.0
-  %693 = fadd float %692, %temp91.0
-  %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00)
-  %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00)
-  %696 = fsub float -0.000000e+00, %694
-  %697 = fadd float %668, %696
-  %698 = fsub float -0.000000e+00, %695
-  %699 = fadd float %678, %698
-  %700 = fmul float %668, %668
-  %701 = fmul float %678, %678
-  %702 = fsub float -0.000000e+00, %700
-  %703 = fadd float %687, %702
-  %704 = fsub float -0.000000e+00, %701
-  %705 = fadd float %689, %704
-  %706 = fcmp uge float %703, %75
-  %707 = select i1 %706, float %703, float %75
-  %708 = fcmp uge float %705, %75
-  %709 = select i1 %708, float %705, float %75
-  %710 = fmul float %697, %697
-  %711 = fadd float %710, %707
-  %712 = fmul float %699, %699
-  %713 = fadd float %712, %709
-  %714 = fdiv float 1.000000e+00, %711
-  %715 = fdiv float 1.000000e+00, %713
-  %716 = fmul float %707, %714
-  %717 = fmul float %709, %715
-  %718 = fcmp oge float %697, 0.000000e+00
-  %719 = sext i1 %718 to i32
-  %720 = bitcast i32 %719 to float
-  %721 = bitcast float %720 to i32
-  %722 = icmp ne i32 %721, 0
-  %.229 = select i1 %722, float 1.000000e+00, float %716
-  %723 = fcmp oge float %699, 0.000000e+00
-  %724 = sext i1 %723 to i32
-  %725 = bitcast i32 %724 to float
-  %726 = bitcast float %725 to i32
-  %727 = icmp ne i32 %726, 0
-  %temp28.0 = select i1 %727, float 1.000000e+00, float %717
-  %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229)
-  %729 = call float @llvm.pow.f32(float %728, float %76)
-  %730 = fmul float %729, %79
-  %731 = fadd float %730, %80
-  %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00)
-  %733 = fmul float %732, %732
-  %734 = fmul float 2.000000e+00, %732
-  %735 = fsub float -0.000000e+00, %734
-  %736 = fadd float 3.000000e+00, %735
-  %737 = fmul float %733, %736
-  %738 = fmul float %548, %737
-  %739 = fmul float %549, %737
-  %740 = fmul float %550, %737
-  %741 = fmul float %738, %515
-  %742 = fadd float %741, %533
-  %743 = fmul float %739, %515
-  %744 = fadd float %743, %534
-  %745 = fmul float %740, %515
-  %746 = fadd float %745, %535
-  %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00)
-  %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00)
-  %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00)
-  %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00)
-  %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00)
-  %752 = fmul float %748, %751
-  %753 = fmul float %749, %751
-  %754 = fmul float %750, %751
-  %755 = fmul float %742, %752
-  %756 = fmul float %744, %753
-  %757 = fmul float %746, %754
-  %758 = fmul float %temp12.0, %216
-  %759 = fmul float %temp13.0, %217
-  %760 = fadd float %759, %758
-  %761 = fmul float %temp14.0, %218
-  %762 = fadd float %760, %761
-  %763 = call float @fabs(float %762)
-  %764 = fmul float %763, %763
-  %765 = fmul float %764, %50
-  %766 = fadd float %765, %51
-  %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00)
-  %768 = fsub float -0.000000e+00, %767
-  %769 = fadd float 1.000000e+00, %768
-  %770 = fmul float %33, %769
-  %771 = fmul float %33, %769
-  %772 = fmul float %33, %769
-  %773 = fmul float %34, %769
-  %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755)
-  %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756)
-  %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757)
-  %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374)
-  %778 = fcmp uge float %774, 0x3E6FFFFE60000000
-  %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000
-  %780 = fcmp uge float %775, 0x3E6FFFFE60000000
-  %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000
-  %782 = fcmp uge float %776, 0x3E6FFFFE60000000
-  %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000
-  %784 = fcmp uge float %779, 6.550400e+04
-  %785 = select i1 %784, float 6.550400e+04, float %779
-  %786 = fcmp uge float %781, 6.550400e+04
-  %787 = select i1 %786, float 6.550400e+04, float %781
-  %788 = fcmp uge float %783, 6.550400e+04
-  %789 = select i1 %788, float 6.550400e+04, float %783
-  %790 = fmul float %777, %52
-  %791 = fadd float %790, %53
-  %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00)
-  %793 = call i32 @llvm.SI.packf16(float %785, float %787)
-  %794 = bitcast i32 %793 to float
-  %795 = call i32 @llvm.SI.packf16(float %789, float %792)
-  %796 = bitcast i32 %795 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796)
-  ret void
-
-ELSE214: ; preds = %ELSE211
-  %797 = fcmp olt float %565, %81
-  %798 = sext i1 %797 to i32
-  %799 = bitcast i32 %798 to float
-  %800 = bitcast float %799 to i32
-  %801 = icmp ne i32 %800, 0
-  %.230 = select i1 %801, float %104, float %100
-  %.231 = select i1 %801, float %105, float %101
-  %.232 = select i1 %801, float %106, float %102
-  %.233 = select i1 %801, float %107, float %103
-  br label %ENDIF209
-}
-
-; Function Attrs: readnone
-declare float @llvm.AMDIL.clamp.(float, float, float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.cndlt(float, float, float) #2
-
-; Function Attrs: readnone
-declare float @llvm.AMDIL.exp.(float) #2
-
-attributes #0 = { "ShaderType"="0" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
-attributes #3 = { nounwind readonly }
-attributes #4 = { readonly }
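The shader above is dominated by @llvm.SI.load.const calls: each one is a scalar read from a constant buffer, addressed as a byte offset into a <16 x i8> resource descriptor, followed by texture sampling through the SI sample intrinsics. A minimal sketch of that constant-read pattern, in the same IR dialect (the function name and offsets here are illustrative only and are not taken from the deleted test):

; Sketch only; assumes the SI intrinsics declared in the test above.
define void @load_const_sketch(<16 x i8> %desc) {
main_body:
  ; two scalar constant-buffer reads, 4 bytes apart in the descriptor
  %a = call float @llvm.SI.load.const(<16 x i8> %desc, i32 0)
  %b = call float @llvm.SI.load.const(<16 x i8> %desc, i32 4)
  %sum = fadd float %a, %b
  ; export the result to all four channels of MRT0
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %sum, float %sum, float %sum, float %sum)
  ret void
}

declare float @llvm.SI.load.const(<16 x i8>, i32)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)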
diff --git a/llvm/test/CodeGen/R600/si-spill-cf.ll b/llvm/test/CodeGen/R600/si-spill-cf.ll
deleted file mode 100644
index 4b2d8ec6bf0..00000000000
--- a/llvm/test/CodeGen/R600/si-spill-cf.ll
+++ /dev/null
@@ -1,501 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
-
-; If this occurs it is likely due to reordering and the restore was
-; originally supposed to happen before SI_END_CF.
-; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
-; SI-NOT: v_readlane_b32 [[SAVED]]
-
-define void @main() #0 {
-main_body:
-  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
-  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
-  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
-  %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
-  %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
-  %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
-  %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
-  %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
-  %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
-  %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
-  %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
-  %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
-  %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
-  %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
-  %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
-  %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
-  %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
-  %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
-  %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
-  %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
-  %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
-  %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
-  %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
-  %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
-  %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
-  %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
-  %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
-  %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
-  %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
-  %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
-  %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
-  %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
-  %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
-  %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
-  %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
-  %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
-  %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
-  %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
-  %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
-  %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
-  %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
-  %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
-  %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
-  %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
-  %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
-  %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
-  %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
-  %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
-  %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
-  %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
-  %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
-  %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
-  %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
-  %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
-  %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
-  %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
-  %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
-  %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
-  %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
-  %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
-  %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
-  %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
-  %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
-  %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
-  %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
-  %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
-  %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
-  br label %LOOP
-
-LOOP: ; preds = %ENDIF2795, %main_body
-  %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
-  %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
-  %67 = icmp sgt i32 undef, 4
-  br i1 %67, label %ENDLOOP, label %ENDIF
-
-ENDLOOP: ; preds = %ELSE2566, %LOOP
-  %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
-  ret void
-
-ENDIF: ; preds = %LOOP
-  %69 = fsub float %2, undef
-  %70 = fsub float %3, undef
-  %71 = fsub float %4, undef
-  %72 = fmul float %69, 0.000000e+00
-  %73 = fmul float %70, undef
-  %74 = fmul float %71, undef
-  %75 = fsub float %6, undef
-  %76 = fsub float %7, undef
-  %77 = fmul float %75, undef
-  %78 = fmul float %76, 0.000000e+00
-  %79 = call float @llvm.minnum.f32(float %74, float %78)
-  %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
-  %81 = call float @llvm.maxnum.f32(float %73, float %77)
-  %82 = call float @llvm.maxnum.f32(float undef, float %79)
-  %83 = call float @llvm.minnum.f32(float %80, float %81)
-  %84 = call float @llvm.minnum.f32(float %83, float undef)
-  %85 = fsub float %14, undef
-  %86 = fsub float %15, undef
-  %87 = fsub float %16, undef
-  %88 = fmul float %85, undef
-  %89 = fmul float %86, undef
-  %90 = fmul float %87, undef
-  %91 = fsub float %17, undef
-  %92 = fsub float %18, undef
-  %93 = fsub float %19, undef
-  %94 = fmul float %91, 0.000000e+00
-  %95 = fmul float %92, undef
-  %96 = fmul float %93, undef
-  %97 = call float @llvm.minnum.f32(float %89, float %95)
-  %98 = call float @llvm.maxnum.f32(float %88, float %94)
-  %99 = call float @llvm.maxnum.f32(float %90, float %96)
-  %100 = call float @llvm.maxnum.f32(float undef, float %97)
-  %101 = call float @llvm.maxnum.f32(float %100, float undef)
-  %102 = call float @llvm.minnum.f32(float %98, float undef)
-  %103 = call float @llvm.minnum.f32(float %102, float %99)
-  %104 = fsub float %30, undef
-  %105 = fsub float %31, undef
-  %106 = fmul float %104, 0.000000e+00
-  %107 = fmul float %105, 0.000000e+00
-  %108 = call float @llvm.minnum.f32(float undef, float %106)
-  %109 = call float @llvm.maxnum.f32(float undef, float %107)
-  %110 = call float @llvm.maxnum.f32(float undef, float %108)
-  %111 = call float @llvm.maxnum.f32(float %110, float undef)
-  %112 = call float @llvm.minnum.f32(float undef, float %109)
-  %113 = fsub float %32, undef
-  %114 = fsub float %33, undef
-  %115 = fsub float %34, undef
-  %116 = fmul float %113, 0.000000e+00
-  %117 = fmul float %114, undef
-  %118 = fmul float %115, undef
-  %119 = fsub float %35, undef
-  %120 = fsub float %36, undef
-  %121 = fsub float %37, undef
-  %122 = fmul float %119, undef
-  %123 = fmul float %120, undef
-  %124 = fmul float %121, undef
-  %125 = call float @llvm.minnum.f32(float %116, float %122)
-  %126 = call float @llvm.minnum.f32(float %117, float %123)
-  %127 = call float @llvm.minnum.f32(float %118, float %124)
-  %128 = call float @llvm.maxnum.f32(float %125, float %126)
-  %129 = call float @llvm.maxnum.f32(float %128, float %127)
-  %130 = fsub float %38, undef
-  %131 = fsub float %39, undef
-  %132 = fsub float %40, undef
-  %133 = fmul float %130, 0.000000e+00
-  %134 = fmul float %131, undef
-  %135 = fmul float %132, undef
-  %136 = fsub float %41, undef
-  %137 = fsub float %42, undef
-  %138 = fsub float %43, undef
-  %139 = fmul float %136, undef
-  %140 = fmul float %137, undef
-  %141 = fmul float %138, undef
-  %142 = call float @llvm.minnum.f32(float %133, float %139)
-  %143 = call float @llvm.minnum.f32(float %134, float %140)
-  %144 = call float @llvm.minnum.f32(float %135, float %141)
-  %145 = call float @llvm.maxnum.f32(float %142, float %143)
-  %146 = call float @llvm.maxnum.f32(float %145, float %144)
-  %147 = fsub float %44, undef
-  %148 = fsub float %45, undef
-  %149 = fsub float %46, undef
-  %150 = fmul float %147, 0.000000e+00
-  %151 = fmul float %148, 0.000000e+00
-  %152 = fmul float %149, undef
-  %153 = fsub float %47, undef
-  %154 = fsub float %48, undef
-  %155 = fsub float %49, undef
-  %156 = fmul float %153, undef
-  %157 = fmul float %154, 0.000000e+00
-  %158 = fmul float %155, undef
-  %159 = call float @llvm.minnum.f32(float %150, float %156)
-  %160 = call float @llvm.minnum.f32(float %151, float %157)
-  %161 = call float @llvm.minnum.f32(float %152, float %158)
-  %162 = call float @llvm.maxnum.f32(float %159, float %160)
-  %163 = call float @llvm.maxnum.f32(float %162, float %161)
-  %164 = fsub float %50, undef
-  %165 = fsub float %51, undef
-  %166 = fsub float %52, undef
-  %167 = fmul float %164, undef
-  %168 = fmul float %165, 0.000000e+00
-  %169 = fmul float %166, 0.000000e+00
-  %170 = fsub float %53, undef
-  %171 = fsub float %54, undef
-  %172 = fsub float %55, undef
-  %173 = fdiv float 1.000000e+00, %temp18.0
-  %174 = fmul float %170, undef
-  %175 = fmul float %171, undef
-  %176 = fmul float %172, %173
-  %177 = call float @llvm.minnum.f32(float %167, float %174)
-  %178 = call float @llvm.minnum.f32(float %168, float %175)
-  %179 = call float @llvm.minnum.f32(float %169, float %176)
-  %180 = call float @llvm.maxnum.f32(float %177, float %178)
-  %181 = call float @llvm.maxnum.f32(float %180, float %179)
-  %182 = fsub float %62, undef
-  %183 = fsub float %63, undef
-  %184 = fsub float %64, undef
-  %185 = fmul float %182, 0.000000e+00
-  %186 = fmul float %183, undef
-  %187 = fmul float %184, undef
-  %188 = fsub float %65, undef
-  %189 = fsub float %66, undef
-  %190 = fmul float %188, undef
-  %191 = fmul float %189, undef
-  %192 = call float @llvm.maxnum.f32(float %185, float %190)
-  %193 = call float @llvm.maxnum.f32(float %186, float %191)
-  %194 = call float @llvm.maxnum.f32(float %187, float undef)
-  %195 = call float @llvm.minnum.f32(float %192, float %193)
-  %196 = call float @llvm.minnum.f32(float %195, float %194)
-  %.temp292.7 = select i1 undef, float %163, float undef
-  %temp292.9 = select i1 false, float %181, float %.temp292.7
-  %.temp292.9 = select i1 undef, float undef, float %temp292.9
-  %197 = fcmp ogt float undef, 0.000000e+00
-  %198 = fcmp olt float undef, %196
-  %199 = and i1 %197, %198
-  %200 = fcmp olt float undef, %.temp292.9
-  %201 = and i1 %199, %200
-  %temp292.11 = select i1 %201, float undef, float %.temp292.9
-  br i1 undef, label %IF2565, label %ELSE2566
-
-IF2565: ; preds = %ENDIF
-  br i1 false, label %ENDIF2582, label %ELSE2584
-
-ELSE2566: ; preds = %ENDIF
-  %202 = fcmp oeq float %temp292.11, 1.000000e+04
-  br i1 %202, label %ENDLOOP, label %ELSE2593
-
-ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588
-  %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
-  %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
-  %203 = fsub float %5, undef
-  %204 = fmul float %203, undef
-  %205 = call float @llvm.maxnum.f32(float undef, float %204)
-  %206 = call float @llvm.minnum.f32(float %205, float undef)
-  %207 = call float @llvm.minnum.f32(float %206, float undef)
-  %208 = fcmp ogt float undef, 0.000000e+00
-  %209 = fcmp olt float undef, 1.000000e+00
-  %210 = and i1 %208, %209
-  %211 = fcmp olt float undef, %207
-  %212 = and i1 %210, %211
-  br i1 %212, label %ENDIF2795, label %ELSE2797
-
-ELSE2584: ; preds = %IF2565
-  br label %ENDIF2582
-
-ENDIF2582: ; preds = %ELSE2584, %IF2565
-  %213 = fadd float %1, undef
-  %214 = fadd float 0.000000e+00, %213
-  %215 = call float @llvm.AMDIL.fraction.(float %214)
-  br i1 undef, label %IF2589, label %ELSE2590
-
-IF2589: ; preds = %ENDIF2582
-  br label %ENDIF2588
-
-ELSE2590: ; preds = %ENDIF2582
-  br label %ENDIF2588
-
-ENDIF2588: ; preds = %ELSE2590, %IF2589
-  %216 = fsub float 1.000000e+00, %215
-  %217 = call float @llvm.sqrt.f32(float %216)
-  %218 = fmul float %217, undef
-  %219 = fadd float %218, undef
-  br label %ENDIF2564
-
-ELSE2593: ; preds = %ELSE2566
-  %220 = fcmp oeq float %temp292.11, %82
-  %221 = fcmp olt float %82, %84
-  %222 = and i1 %220, %221
-  br i1 %222, label %ENDIF2594, label %ELSE2596
-
-ELSE2596: ; preds = %ELSE2593
-  %223 = fcmp oeq float %temp292.11, %101
-  %224 = fcmp olt float %101, %103
-  %225 = and i1 %223, %224
-  br i1 %225, label %ENDIF2594, label %ELSE2632
-
-ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
-  %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
-  %226 = fmul float %temp894.2, undef
-  br label %ENDIF2564
-
-ELSE2632: ; preds = %ELSE2596
-  br i1 undef, label %ENDIF2594, label %ELSE2650
-
-ELSE2650: ; preds = %ELSE2632
-  %227 = fcmp oeq float %temp292.11, %111
-  %228 = fcmp olt float %111, %112
-  %229 = and i1 %227, %228
-  br i1 %229, label %IF2667, label %ELSE2668
-
-IF2667: ; preds = %ELSE2650
-  br i1 undef, label %ENDIF2594, label %ELSE2671
-
-ELSE2668: ; preds = %ELSE2650
-  %230 = fcmp oeq float %temp292.11, %129
-  %231 = fcmp olt float %129, undef
-  %232 = and i1 %230, %231
-  br i1 %232, label %ENDIF2594, label %ELSE2686
-
-ELSE2671: ; preds = %IF2667
-  br label %ENDIF2594
-
-ELSE2686: ; preds = %ELSE2668
-  %233 = fcmp oeq float %temp292.11, %146
-  %234 = fcmp olt float %146, undef
-  %235 = and i1 %233, %234
-  br i1 %235, label %ENDIF2594, label %ELSE2704
-
-ELSE2704: ; preds = %ELSE2686
-  %236 = fcmp oeq float %temp292.11, %181
-  %237 = fcmp olt float %181, undef
-  %238 = and i1 %236, %237
-  br i1 %238, label %ENDIF2594, label %ELSE2740
-
-ELSE2740: ; preds = %ELSE2704
-  br i1 undef, label %IF2757, label %ELSE2758
-
-IF2757: ; preds = %ELSE2740
-  br i1 undef, label %ENDIF2594, label %ELSE2761
-
-ELSE2758: ; preds = %ELSE2740
-  br i1 undef, label %IF2775, label %ENDIF2594
-
-ELSE2761: ; preds = %IF2757
-  br label %ENDIF2594
-
-IF2775: ; preds = %ELSE2758
-  %239 = fcmp olt float undef, undef
-  br i1 %239, label %ENDIF2594, label %ELSE2779
-
-ELSE2779: ; preds = %IF2775
-  br i1 undef, label %ENDIF2594, label %ELSE2782
-
-ELSE2782: ; preds = %ELSE2779
-  br i1 undef, label %ENDIF2594, label %ELSE2785
-
-ELSE2785: ; preds = %ELSE2782
-  %240 = fcmp olt float undef, 0.000000e+00
-  br i1 %240, label %ENDIF2594, label %ELSE2788
-
-ELSE2788: ; preds = %ELSE2785
-  %241 = fcmp olt float 0.000000e+00, undef
-  %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
-  br label %ENDIF2594
-
-ELSE2797: ; preds = %ENDIF2564
-  %242 = fsub float %8, undef
-  %243 = fsub float %9, undef
-  %244 = fsub float %10, undef
-  %245 = fmul float %242, undef
-  %246 = fmul float %243, undef
-  %247 = fmul float %244, undef
-  %248 = fsub float %11, undef
-  %249 = fsub float %12, undef
-  %250 = fsub float %13, undef
-  %251 = fmul float %248, undef
-  %252 = fmul float %249, undef
-  %253 = fmul float %250, undef
-  %254 = call float @llvm.minnum.f32(float %245, float %251)
-  %255 = call float @llvm.minnum.f32(float %246, float %252)
-  %256 = call float @llvm.maxnum.f32(float %247, float %253)
-  %257 = call float @llvm.maxnum.f32(float %254, float %255)
-  %258 = call float @llvm.maxnum.f32(float %257, float undef)
-  %259 = call float @llvm.minnum.f32(float undef, float %256)
-  %260 = fcmp ogt float %258, 0.000000e+00
-  %261 = fcmp olt float %258, 1.000000e+00
-  %262 = and i1 %260, %261
-  %263 = fcmp olt float %258, %259
-  %264 = and i1 %262, %263
-  br i1 %264, label %ENDIF2795, label %ELSE2800
-
-ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
-  br label %LOOP
-
-ELSE2800: ; preds = %ELSE2797
-  br i1 undef, label %ENDIF2795, label %ELSE2803
-
-ELSE2803: ; preds = %ELSE2800
-  %265 = fsub float %20, undef
-  %266 = fsub float %21, undef
-  %267 = fsub float %22, undef
-  %268 = fmul float %265, undef
-  %269 = fmul float %266, undef
-  %270 = fmul float %267, 0.000000e+00
-  %271 = fsub float %23, undef
-  %272 = fsub float %24, undef
-  %273 = fsub float %25, undef
-  %274 = fmul float %271, undef
-  %275 = fmul float %272, undef
-  %276 = fmul float %273, undef
-  %277 = call float @llvm.minnum.f32(float %268, float %274)
-  %278 = call float @llvm.maxnum.f32(float %269, float %275)
-  %279 = call float @llvm.maxnum.f32(float %270, float %276)
-  %280 = call float @llvm.maxnum.f32(float %277, float undef)
-  %281 = call float @llvm.maxnum.f32(float %280, float undef)
-  %282 = call float @llvm.minnum.f32(float undef, float %278)
-  %283 = call float @llvm.minnum.f32(float %282, float %279)
-  %284 = fcmp ogt float %281, 0.000000e+00
-  %285 = fcmp olt float %281, 1.000000e+00
-  %286 = and i1 %284, %285
-  %287 = fcmp olt float %281, %283
-  %288 = and i1 %286, %287
-  br i1 %288, label %ENDIF2795, label %ELSE2806
-
-ELSE2806: ; preds = %ELSE2803
-  %289 = fsub float %26, undef
-  %290 = fsub float %27, undef
-  %291 = fsub float %28, undef
-  %292 = fmul float %289, undef
-  %293 = fmul float %290, 0.000000e+00
-  %294 = fmul float %291, undef
-  %295 = fsub float %29, undef
-  %296 = fmul float %295, undef
-  %297 = call float @llvm.minnum.f32(float %292, float %296)
-  %298 = call float @llvm.minnum.f32(float %293, float undef)
-  %299 = call float @llvm.maxnum.f32(float %294, float undef)
-  %300 = call float @llvm.maxnum.f32(float %297, float %298)
-  %301 = call float @llvm.maxnum.f32(float %300, float undef)
-  %302 = call float @llvm.minnum.f32(float undef, float %299)
-  %303 = fcmp ogt float %301, 0.000000e+00
-  %304 = fcmp olt float %301, 1.000000e+00
-  %305 = and i1 %303, %304
-  %306 = fcmp olt float %301, %302
-  %307 = and i1 %305, %306
-  br i1 %307, label %ENDIF2795, label %ELSE2809
-
-ELSE2809: ; preds = %ELSE2806
-  br i1 undef, label %ENDIF2795, label %ELSE2812
-
-ELSE2812: ; preds = %ELSE2809
-  br i1 undef, label %ENDIF2795, label %ELSE2815
-
-ELSE2815: ; preds = %ELSE2812
-  br i1 undef, label %ENDIF2795, label %ELSE2818
-
-ELSE2818: ; preds = %ELSE2815
-  br i1 undef, label %ENDIF2795, label %ELSE2821
-
-ELSE2821: ; preds = %ELSE2818
-  %308 = fsub float %56, undef
-  %309 = fsub float %57, undef
-  %310 = fsub float %58, undef
-  %311 = fmul float %308, undef
-  %312 = fmul float %309, 0.000000e+00
-  %313 = fmul float %310, undef
-  %314 = fsub float %59, undef
-  %315 = fsub float %60, undef
-  %316 = fsub float %61, undef
-  %317 = fmul float %314, undef
-  %318 = fmul float %315, undef
-  %319 = fmul float %316, undef
-  %320 = call float @llvm.maxnum.f32(float %311, float %317)
-  %321 = call float @llvm.maxnum.f32(float %312, float %318)
-  %322 = call float @llvm.maxnum.f32(float %313, float %319)
-  %323 = call float @llvm.minnum.f32(float %320, float %321)
-  %324 = call float @llvm.minnum.f32(float %323, float %322)
-  %325 = fcmp ogt float undef, 0.000000e+00
-  %326 = fcmp olt float undef, 1.000000e+00
-  %327 = and i1 %325, %326
-  %328 = fcmp olt float undef, %324
-  %329 = and i1 %327, %328
-  br i1 %329, label %ENDIF2795, label %ELSE2824
-
-ELSE2824: ; preds = %ELSE2821
-  %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
-  br label %ENDIF2795
-}
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDIL.fraction.(float) #2
-
-; Function Attrs: nounwind readnone
-declare float @llvm.sqrt.f32(float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.minnum.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.maxnum.f32(float, float) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.lrp(float, float, float) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { readnone }
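The CHECK lines of si-spill-cf.ll above pin down an ordering constraint: under heavy SGPR pressure the register pair holding the saved exec mask can itself be spilled, and its restore (the v_readlane_b32 the test forbids) must not be scheduled after the s_or_b64 exec, exec, ... that SI_END_CF expands to. A reduced sketch of the divergent-loop shape that produces such a saved mask (illustrative only; the real test additionally needs the surrounding register pressure to force the spill):

; Sketch, not one of the deleted tests; %bound stands in for the test's undef trip bound.
define void @end_cf_sketch(i32 %bound) {
entry:
  br label %loop

loop:
  ; a divergent loop: each lane may take a different trip count, so the
  ; branch saves the exec mask for SI_END_CF to restore at the exit
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %i.next = add i32 %i, 1
  %done = icmp sgt i32 %i.next, %bound
  br i1 %done, label %exit, label %loop

exit:
  ret void
}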
diff --git a/llvm/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
deleted file mode 100644
index 5a6129aaa3f..00000000000
--- a/llvm/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
+++ /dev/null
@@ -1,236 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
-
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
-declare void @llvm.AMDGPU.barrier.local() #2
-
-
-@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
-@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
-@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
-
-; FUNC-LABEL: @reorder_local_load_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; CI: buffer_store_dword
-define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
-
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: buffer_store_dword
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
-
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; CI: buffer_store_dword
-define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
-
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  call void @llvm.AMDGPU.barrier.local() #2
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
-  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
-
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
-; CI: s_load_dword
-; CI: s_load_dword
-; CI: s_load_dword
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_global_load_local_store_global_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
-; CI: ds_write_b32
-; CI: buffer_store_dword
-define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2
-
-  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
-
-  %add = add nsw i32 %tmp1, %tmp2
-
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_local_offsets
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
-; CI: buffer_store_dword
-; CI: s_endpgm
-define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101
-
-  store i32 123, i32 addrspace(3)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
-  store i32 123, i32 addrspace(3)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 789, i32 addrspace(3)* %ptr3, align 4
-
-  %add.0 = add nsw i32 %tmp2, %tmp1
-  %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword
-; CI: s_endpgm
-define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101
-
-  store i32 123, i32 addrspace(1)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
-  store i32 123, i32 addrspace(1)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 789, i32 addrspace(1)* %ptr3, align 4
-
-  %add.0 = add nsw i32 %tmp2, %tmp1
-  %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
-; XCI: TBUFFER_STORE_FORMAT
-; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
-; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
-;   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
-
-;   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-;   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
-
-;   %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-
-;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
-;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
-;     i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
-;     i32 1, i32 0)
-
-;   %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
-
-;   %add = add nsw i32 %tmp1, %tmp2
-
-;   store i32 %add, i32 addrspace(1)* %out, align 4
-;   ret void
-; }
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind noduplicate }
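The deleted file above encodes the AA-driven scheduling rules: accesses in provably disjoint address spaces (LDS, addrspace(3), versus global, addrspace(1)) may be reordered, volatile accesses and barriers pin program order, and two constant-address-space loads around a global store need more than an instruction-type check to be proven disjoint. A distilled sketch of the legal case (illustrative function name, not one of the deleted tests):

; Sketch only; mirrors the reorder_local_load_global_store_local_load shape.
define void @disjoint_reorder_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %g, i32 addrspace(3)* %l) {
  ; the LDS loads cannot alias the global store, so the scheduler may
  ; move them together and pair their DS offsets
  %v1 = load i32, i32 addrspace(3)* %l, align 4
  store i32 99, i32 addrspace(1)* %g, align 4
  %p2 = getelementptr inbounds i32, i32 addrspace(3)* %l, i32 1
  %v2 = load i32, i32 addrspace(3)* %p2, align 4
  %sum = add nsw i32 %v1, %v2
  store i32 %sum, i32 addrspace(1)* %out, align 4
  ret void
}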
mode 100644 index bd427dd3ed4..00000000000 --- a/llvm/test/CodeGen/R600/si-vector-hang.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}test_8_min_char: -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; ModuleID = 'radeon' - -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { -entry: - %0 = load i8, i8 addrspace(1)* %in0, align 1 - %1 = insertelement <8 x i8> undef, i8 %0, i32 0 - %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 - %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 - %3 = insertelement <8 x i8> %1, i8 %2, i32 1 - %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 - %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 - %5 = insertelement <8 x i8> %3, i8 %4, i32 2 - %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 - %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 - %7 = insertelement <8 x i8> %5, i8 %6, i32 3 - %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 - %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 - %9 = insertelement <8 x i8> undef, i8 %8, i32 0 - %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 - %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 - %11 = insertelement <8 x i8> %9, i8 %10, i32 1 - %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 - %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 - %13 = insertelement <8 x i8> %11, i8 %12, i32 2 - %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 - %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 - %15 = insertelement <8 x i8> %13, i8 %14, i32 3 - %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> - %16 = load i8, i8 addrspace(1)* %in1, align 1 - %17 = insertelement <8 x i8> undef, i8 %16, i32 0 - %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 - %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 - %19 = insertelement <8 x i8> %17, i8 %18, i32 1 - %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 - %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 - %21 = insertelement <8 x i8> %19, i8 %20, i32 2 - %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 - %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 - %23 = insertelement <8 x i8> %21, i8 %22, i32 3 - %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 - %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 - %25 = insertelement <8 x i8> undef, i8 %24, i32 0 - %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 - %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 - %27 = insertelement <8 x i8> %25, i8 %26, i32 1 - %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 - %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 - %29 = insertelement <8 x i8> %27, i8 %28, i32 2 - %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 - %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 - %31 = insertelement <8 
x i8> %29, i8 %30, i32 3 - %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> - %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 - %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 - %32 = extractelement <8 x i8> %cond.i, i32 0 - store i8 %32, i8 addrspace(1)* %out, align 1 - %33 = extractelement <8 x i8> %cond.i, i32 1 - %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 - %34 = extractelement <8 x i8> %cond.i, i32 2 - %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 - store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 - %35 = extractelement <8 x i8> %cond.i, i32 3 - %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 - store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 - %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 - %36 = extractelement <8 x i8> %cond.i, i32 4 - store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 - %37 = extractelement <8 x i8> %cond.i, i32 5 - %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 - store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 - %38 = extractelement <8 x i8> %cond.i, i32 6 - %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 - store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 - %39 = extractelement <8 x i8> %cond.i, i32 7 - %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 - store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} - -!0 = !{null} -!1 = !{null} -!2 = !{null} -!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} -!4 = !{null} -!5 = !{null} -!6 = !{null} -!7 = !{null} -!8 = !{null} diff --git a/llvm/test/CodeGen/R600/sign_extend.ll b/llvm/test/CodeGen/R600/sign_extend.ll deleted file mode 100644 index 06bee114c23..00000000000 --- a/llvm/test/CodeGen/R600/sign_extend.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_sext_i1_to_i32: -; SI: v_cndmask_b32_e64 -; SI: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpgm -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { -entry: - %mul = mul i32 %a, %b - %add = add i32 %mul, %c - %sext = sext i32 %add to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i1_to_i64: -; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc -; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} -; SI: s_endpgm -define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i64 - store i64 %sext, i64 addrspace(1)* 
%out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { - %sext = sext i32 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}v_sext_i32_to_i64: -; SI: v_ashr -; SI: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %sext = sext i32 %val to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i16_to_i64: -; SI: s_endpgm -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { - %sext = sext i16 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/llvm/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll deleted file mode 100644 index dffee70b6b0..00000000000 --- a/llvm/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll +++ /dev/null @@ -1,39 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s - -; 64-bit select was originally lowered with a build_pair, and this -; could be simplified to 1 cndmask instead of 2, but that broke when -; it started being implemented with a v2i32 build_vector and -; bitcasting. -define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %select to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Fix truncating store for local memory -; SI-LABEL: {{^}}trunc_load_alloca_i64: -; SI: v_movrels_b32 -; SI-NOT: v_movrels_b32 -; SI: s_endpgm -define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { - %idx = add i32 %a, %b - %alloca = alloca i64, i32 4 - %gep0 = getelementptr i64, i64* %alloca, i64 0 - %gep1 = getelementptr i64, i64* %alloca, i64 1 - %gep2 = getelementptr i64, i64* %alloca, i64 2 - %gep3 = getelementptr i64, i64* %alloca, i64 3 - store i64 24, i64* %gep0, align 8 - store i64 9334, i64* %gep1, align 8 - store i64 3935, i64* %gep2, align 8 - store i64 9342, i64* %gep3, align 8 - %gep = getelementptr i64, i64* %alloca, i32 %idx - %load = load i64, i64* %gep, align 8 - %mask = and i64 %load, 4294967296 - %add = add i64 %mask, -1 - store i64 %add, i64 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/sint_to_fp.f64.ll b/llvm/test/CodeGen/R600/sint_to_fp.f64.ll deleted file mode 100644 index da4e91db3a3..00000000000 --- a/llvm/test/CodeGen/R600/sint_to_fp.f64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}sint_to_fp_i32_to_f64: -; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}sint_to_fp_i1_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, 
[[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = sitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 -; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %result = sitofp i64 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: @v_sint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = sitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/sint_to_fp.ll b/llvm/test/CodeGen/R600/sint_to_fp.ll deleted file mode 100644 index 8506441d136..00000000000 --- a/llvm/test/CodeGen/R600/sint_to_fp.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v2i32: -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = sitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v4i32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sitofp <4 x i32> %value to <4 x float> - 
store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_i1_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/smrd.ll b/llvm/test/CodeGen/R600/smrd.ll deleted file mode 100644 index b0c18ca5959..00000000000 --- a/llvm/test/CodeGen/R600/smrd.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -; SMRD load with an immediate offset. -; GCN-LABEL: {{^}}smrd0: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with the largest possible immediate offset. -; GCN-LABEL: {{^}}smrd1: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with an offset greater than the largest possible immediate. 
-; GCN-LABEL: {{^}}smrd2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with a 64-bit offset -; GCN-LABEL: {{^}}smrd3: -; FIXME: There are too many copies here because we don't fold immediates -; through REG_SEQUENCE -; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; -; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 -; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; FIXME: We should be able to use s_load_dword here -; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 -; TODO: Add VI checks -; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load using the load.const intrinsic with an immediate offset -; GCN-LABEL: {{^}}smrd_load_const0: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; SMRD load using the load.const intrinsic with the largest possible immediate -; offset. -; GCN-LABEL: {{^}}smrd_load_const1: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} -; SMRD load using the load.const intrinsic with an offset greater than the -; largest possible immediate offset. 
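; A summary inferred from the CHECK lines in this file rather than taken from the original test: SI encodes SMRD and s_buffer_load immediate offsets in dword units in an 8-bit field, while VI encodes them in bytes. That is why the same element index 255 appears as 0xff on SI but as 0x3fc (255 * 4 bytes) on VI, and why index 256 (byte offset 0x400) no longer fits SI's immediate and is materialized in an SGPR with s_movk_i32, while VI still encodes 0x400 directly.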
-; GCN-LABEL: {{^}}smrd_load_const2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/split-scalar-i64-add.ll b/llvm/test/CodeGen/R600/split-scalar-i64-add.ll deleted file mode 100644 index 46409cdfae1..00000000000 --- a/llvm/test/CodeGen/R600/split-scalar-i64-add.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -; This is broken because the low half of the 64-bit add remains on the -; SALU, but the upper half does not. The addc expects the carry bit -; set in vcc, which is undefined since the low scalar half add sets -; scc instead. - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { - %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, 399 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; Doesn't use constants -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %load = load i32, i32 addrspace(1)* %gep - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/sra.ll b/llvm/test/CodeGen/R600/sra.ll deleted file mode 100644 index bcbc32f4c05..00000000000 --- a/llvm/test/CodeGen/R600/sra.ll +++ /dev/null @@ -1,213 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck 
--check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG-LABEL: {{^}}ashr_v2i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v2i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = ashr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v4i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = ashr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64: -;EG: ASHR - -;SI-LABEL: {{^}}ashr_i64: -;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -;VI-LABEL: {{^}}ashr_i64: -;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = ashr i64 %0, 8 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64_2: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_i64_2: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_i64_2: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = ashr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v2i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = ashr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: ASHR {{.*}}, [[SHC]] -;EG-DAG: ASHR {{.*}}, [[SHD]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHC]] -;EG-DAG: LSHR {{.*}}, [[SHD]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v4i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = ashr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - diff --git a/llvm/test/CodeGen/R600/srem.ll b/llvm/test/CodeGen/R600/srem.ll deleted file mode 100644 index c78fd549b31..00000000000 --- a/llvm/test/CodeGen/R600/srem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s - -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = srem i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 -; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = srem <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = srem <2 x i32> %num, <i32 4, i32 4> - store <2 x i32> %result, <2 
x i32> addrspace(1)* %out - ret void -} - -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = srem <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %num = load i64, i64 addrspace(1) * %in - %den = load i64, i64 addrspace(1) * %den_ptr - %result = srem i64 %num, %den - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %num = load i64, i64 addrspace(1) * %in - %result = srem i64 %num, 4 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr - %result = srem <2 x i64> %num, %den - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %result = srem <2 x i64> %num, <i64 4, i64 4> - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr - %result = srem <4 x i64> %num, %den - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4> - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/srl.ll b/llvm/test/CodeGen/R600/srl.ll deleted file mode 100644 index 4904d7fa1bd..00000000000 --- a/llvm/test/CodeGen/R600/srl.ll +++ /dev/null @@ -1,186 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lshr_i32: -; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = lshr i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = lshr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = lshr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = lshr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = lshr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = lshr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/ssubo.ll b/llvm/test/CodeGen/R600/ssubo.ll deleted file mode 100644 index 26884a1b776..00000000000 --- a/llvm/test/CodeGen/R600/ssubo.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } 
%ssub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/llvm/test/CodeGen/R600/store-barrier.ll b/llvm/test/CodeGen/R600/store-barrier.ll deleted file mode 100644 index 4a72b4d090a..00000000000 --- a/llvm/test/CodeGen/R600/store-barrier.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s - -; This test is for a bug in the machine scheduler where stores without -; an underlying object would be moved across the barrier. In this -; test, the <2 x i8> store will be split into two i8 stores, so they -; won't have an underlying object. - -; CHECK-LABEL: {{^}}test: -; CHECK: ds_write_b8 -; CHECK: ds_write_b8 -; CHECK: s_barrier -; CHECK: s_endpgm -; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { -bb: - %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 - %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 - %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 - %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 - %tmp16 = add i32 %tmp13, 1 - %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 - store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 - tail call void @llvm.AMDGPU.barrier.local() #2 - %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 - %tmp26 = sext i32 %tmp25 to i64 - %tmp27 = sext i32 %arg4 to i64 - %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 - %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 - %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 - store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 - %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 - %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 - %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 - store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #2 = { noduplicate nounwind } diff --git a/llvm/test/CodeGen/R600/store-v3i32.ll b/llvm/test/CodeGen/R600/store-v3i32.ll deleted file mode 100644 index 33617b55ed6..00000000000 --- a/llvm/test/CodeGen/R600/store-v3i32.ll +++ /dev/null @@ -1,13 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs 
-march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; 3-element vectors have the same size and alignment as 4-element vectors, so this -; should be done in a single store. - -; SI-LABEL: {{^}}store_v3i32: -; SI: buffer_store_dwordx4 -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { - store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/llvm/test/CodeGen/R600/store-v3i64.ll b/llvm/test/CodeGen/R600/store-v3i64.ll deleted file mode 100644 index e0c554ad2c1..00000000000 --- a/llvm/test/CodeGen/R600/store-v3i64.ll +++ /dev/null @@ -1,29 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}global_store_v3i64: -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}global_store_v3i64_unaligned: -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64: -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64_unaligned: -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1 - ret void -} diff --git a/llvm/test/CodeGen/R600/store-vector-ptrs.ll b/llvm/test/CodeGen/R600/store-vector-ptrs.ll deleted file mode 100644 index d5af3b29118..00000000000 --- a/llvm/test/CodeGen/R600/store-vector-ptrs.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s - -; This tests for a bug that caused a crash in -; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting -; scratch loads and stores. 
-; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { - %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16> - store <4 x i32*> %p, <4 x i32*>* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/store.ll b/llvm/test/CodeGen/R600/store.ll deleted file mode 100644 index 0f89405e073..00000000000 --- a/llvm/test/CodeGen/R600/store.ll +++ /dev/null @@ -1,369 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s - -;===------------------------------------------------------------------------===; -; Global Address Space -;===------------------------------------------------------------------------===; -; FUNC-LABEL: {{^}}store_i1: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { -entry: - store i1 true, i1 addrspace(1)* %out - ret void -} - -; i8 store -; FUNC-LABEL: {{^}}store_i8: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y -; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) - - -; IG 1: Truncate the calculated shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 255 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the previous IGs. -; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_byte - -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { -entry: - store i8 %in, i8 addrspace(1)* %out - ret void -} - -; i16 store -; FUNC-LABEL: {{^}}store_i16: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value - - -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG-NEXT: 3(4.203895e-45), - -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y - -; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) -; IG 1: Truncate the calculated shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 65535 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the previous IGs. 
-; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { -entry: - store i16 %in, i16 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v2i8: -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i8> - store <2 x i8> %0, <2 x i8> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}store_v2i16: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_short -; SI: buffer_store_short -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i16> - store <2 x i16> %0, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i8: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(1)* %out - ret void -} - -; floating-point store -; FUNC-LABEL: {{^}}store_f32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 - -; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_store_dword - -define void @store_f32(float addrspace(1)* %out, float %in) { - store float %in, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i16: -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI-NOT: buffer_store_byte -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i16> - store <4 x i16> %0, <4 x i16> addrspace(1)* %out - ret void -} - -; vec2 floating-point stores -; FUNC-LABEL: {{^}}store_v2f32: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx2 - -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { -entry: - %0 = insertelement <2 x float> undef, float %a, i32 0 - %1 = insertelement <2 x float> %0, float %b, i32 1 - store <2 x float> %1, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i32: -; EG: MEM_RAT_CACHELESS STORE_RAW -; EG-NOT: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD -; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i8: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i16: -; EG: MEM_RAT MSKOR -; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; Local Address Space -;===------------------------------------------------------------------------===; - -; FUNC-LABEL: 
{{^}}store_local_i1: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { -entry: - store i1 true, i1 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i8: -; EG: LDS_BYTE_WRITE - -; SI: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { - store i8 %in, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i16: -; EG: LDS_SHORT_WRITE - -; SI: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { - store i16 %in, i16 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i16: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b16 -; SI: ds_write_b16 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i8: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i32: -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { -entry: - store <2 x i32> %in, <2 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i32: -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i8: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i16: -; EG: LDS_SHORT_WRITE -; SI: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(3)* %out - ret void -} - -; The stores in this function are combined by the optimizer to create a -; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer -; should not try to split the 64-bit store back into 2 32-bit stores. -; -; Evergreen / Northern Islands don't support 64-bit stores yet, so there should -; be two 32-bit stores. 
-
-; FUNC-LABEL: {{^}}vecload2:
-; EG: MEM_RAT_CACHELESS STORE_RAW
-
-; CM: MEM_RAT_CACHELESS STORE_DWORD
-
-; SI: buffer_store_dwordx2
-define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
-entry:
- %0 = load i32, i32 addrspace(2)* %mem, align 4
- %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
- %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
- store i32 %0, i32 addrspace(1)* %out, align 4
- %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
- store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
- ret void
-}
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-; When i128 was a legal type this program generated 'cannot select' errors:
-
-; FUNC-LABEL: {{^}}"i128-const-store":
-; FIXME: We should be able to do this with one store instruction
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; EG: STORE_RAW
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; CM: STORE_DWORD
-; SI: buffer_store_dwordx4
-define void @i128-const-store(i32 addrspace(1)* %out) {
-entry:
- store i32 1, i32 addrspace(1)* %out, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
- store i32 1, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
- store i32 2, i32 addrspace(1)* %arrayidx4, align 4
- %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
- store i32 2, i32 addrspace(1)* %arrayidx6, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/store.r600.ll b/llvm/test/CodeGen/R600/store.r600.ll
deleted file mode 100644
index 696fb033b5e..00000000000
--- a/llvm/test/CodeGen/R600/store.r600.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
-
-; XXX: Merge this test into store.ll once it is supported on SI
-
-; v4i32 store
-; EG: {{^}}store_v4i32:
-; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
-
-define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
- %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in
- store <4 x i32> %1, <4 x i32> addrspace(1)* %out
- ret void
-}
-
-; v4f32 store
-; EG: {{^}}store_v4f32:
-; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
-define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
- %1 = load <4 x float>, <4 x float> addrspace(1) * %in
- store <4 x float> %1, <4 x float> addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/structurize.ll b/llvm/test/CodeGen/R600/structurize.ll
deleted file mode 100644
index 02e592e9a55..00000000000
--- a/llvm/test/CodeGen/R600/structurize.ll
+++ /dev/null
@@ -1,83 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
-; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
-;
-;                        entry
-;                       /     \
-;           diamond_head       branch_from
-;             /      \           |
-; diamond_false       diamond_true
-;             \      /
-;               done
-;
-; When the diamond_true branch had more than 100 instructions.
-;
-;
-
-; CHECK-LABEL: {{^}}branch_into_diamond:
-; === entry block:
-; CHECK: ALU_PUSH_BEFORE
-; === Branch instruction (IF):
-; CHECK: JUMP
- ; === branch_from block
- ; CHECK: ALU
- ; === Duplicated diamond_true block (There can be more than one ALU clause):
- ; === XXX: We should be able to optimize this so the basic block is not
- ; === duplicated. See comments in
- ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf()
- ; CHECK: ALU
-; === Branch instruction (ELSE):
-; CHECK: ELSE
- ; === diamond_head block:
- ; CHECK: ALU_PUSH_BEFORE
- ; === Branch instruction (IF):
- ; CHECK: JUMP
- ; === diamond_true block (There can be more than one ALU clause):
- ; ALU
- ; === Branch instruction (ELSE):
- ; CHECK: ELSE
- ; === diamond_false block plus implicit ENDIF
- ; CHECK: ALU_POP_AFTER
-; === Branch instruction (ENDIF):
-; CHECK: POP
-; === done block:
-; CHECK: ALU
-; CHECK: MEM_RAT_CACHELESS
-; CHECK: CF_END
-
-
-define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
-entry:
-%0 = icmp ne i32 %a, 0
- br i1 %0, label %diamond_head, label %branch_from
-
-diamond_head:
- %1 = icmp ne i32 %a, 1
- br i1 %1, label %diamond_true, label %diamond_false
-
-branch_from:
- %2 = add i32 %a, 1
- br label %diamond_true
-
-diamond_false:
- %3 = add i32 %a, 2
- br label %done
-
-diamond_true:
- %4 = phi i32 [%2, %branch_from], [%a, %diamond_head]
- ; This block needs to be > 100 ISA instructions to hit the bug,
- ; so we'll use udiv instructions.
- %div0 = udiv i32 %a, %b
- %div1 = udiv i32 %div0, %4
- %div2 = udiv i32 %div1, 11
- %div3 = udiv i32 %div2, %a
- %div4 = udiv i32 %div3, %b
- %div5 = udiv i32 %div4, %c
- %div6 = udiv i32 %div5, %div0
- %div7 = udiv i32 %div6, %div1
- br label %done
-
-done:
- %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true]
- store i32 %5, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/structurize1.ll b/llvm/test/CodeGen/R600/structurize1.ll
deleted file mode 100644
index 77432c1f9d2..00000000000
--- a/llvm/test/CodeGen/R600/structurize1.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
-
-; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
-; like this:
-;
-; for (i = 0; i < x; i++) {
-;   if (cond0) {
-;     if (cond1) {
-;
-;     } else {
-;
-;     }
-;     if (cond2) {
-;
-;     }
-;   }
-; }
-
-; CHECK-LABEL: {{^}}if_inside_loop:
-; CHECK: LOOP_START_DX10
-; CHECK: END_LOOP
-define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
-entry:
- br label %for.body
-
-for.body:
- %0 = phi i32 [0, %entry], [%inc, %for.inc]
- %val = phi i32 [0, %entry], [%val.for.inc, %for.inc]
- %inc = add i32 %0, 1
- %1 = icmp ult i32 10, %a
- br i1 %1, label %for.inc, label %if.then
-
-if.then:
- %2 = icmp ne i32 0, %b
- br i1 %2, label %if.then.true, label %if.then.false
-
-if.then.true:
- %3 = add i32 %a, %val
- br label %if
-
-if.then.false:
- %4 = mul i32 %a, %val
- br label %if
-
-if:
- %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false]
- %5 = icmp ne i32 0, %c
- br i1 %5, label %if.true, label %for.inc
-
-if.true:
- %6 = add i32 %a, %val.if
- br label %for.inc
-
-for.inc:
- %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true]
- %7 = icmp ne i32 0, %d
- br i1 %7, label %for.body, label %exit
-
-exit:
- store i32 %val.for.inc, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/sub.ll b/llvm/test/CodeGen/R600/sub.ll
deleted file mode 100644
index b7fba0efa5b..00000000000
--- 
a/llvm/test/CodeGen/R600/sub.ll +++ /dev/null @@ -1,130 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; FUNC-LABEL: {{^}}test_sub_i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = sub i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_sub_v2i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = sub <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_sub_v4i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = sub <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_sub_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sub_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* 
%a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v2i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid - %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = sub <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid - %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = sub <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/subreg-coalescer-crash.ll b/llvm/test/CodeGen/R600/subreg-coalescer-crash.ll deleted file mode 100644 index c4dae4736cf..00000000000 --- a/llvm/test/CodeGen/R600/subreg-coalescer-crash.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s - -; SI-LABEL:{{^}}row_filter_C1_D0: -; SI: s_endpgm -; Function Attrs: nounwind -define void @row_filter_C1_D0() { -entry: - br i1 undef, label %for.inc.1, label %do.body.preheader - -do.body.preheader: ; preds = %entry - %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 - br i1 undef, label %do.body56.1, label %do.body90 - -do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader - %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] - %2 = insertelement <4 x i32> %1, i32 undef, i32 2 - %3 = insertelement <4 x i32> %2, i32 undef, i32 3 - br i1 undef, label %do.body124.1, label %do.body.1562.preheader - -do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 - %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] - %4 = insertelement <4 x i32> undef, i32 undef, i32 1 - br label %for.inc.1 - -do.body56.1: ; preds = %do.body.preheader - %5 = insertelement <4 x i32> %0, i32 undef, i32 1 - %or.cond472.1 = or i1 undef, undef - br i1 %or.cond472.1, label %do.body56.2, label %do.body90 - -do.body56.2: ; preds = %do.body56.1 - %6 = insertelement <4 x i32> %5, i32 undef, i32 1 - br label %do.body90 - -do.body124.1: ; preds = %do.body90 - %7 = insertelement <4 x i32> %3, i32 undef, i32 3 - br label %do.body.1562.preheader - -for.inc.1: ; preds = %do.body.1562.preheader, %entry - %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] - %add.i495 = add <4 x i32> %storemerge591, undef - unreachable -} - -; SI-LABEL: {{^}}foo: -; SI: 
s_endpgm -define void @foo() #0 { -bb: - br i1 undef, label %bb2, label %bb1 - -bb1: ; preds = %bb - br i1 undef, label %bb4, label %bb6 - -bb2: ; preds = %bb4, %bb - %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] - br i1 undef, label %bb9, label %bb13 - -bb4: ; preds = %bb7, %bb6, %bb1 - %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] - br label %bb2 - -bb6: ; preds = %bb1 - br i1 undef, label %bb7, label %bb4 - -bb7: ; preds = %bb6 - %tmp8 = fmul float undef, undef - br label %bb4 - -bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) - %tmp11 = extractelement <4 x float> %tmp10, i32 1 - %tmp12 = extractelement <4 x float> %tmp10, i32 3 - br label %bb14 - -bb13: ; preds = %bb2 - br i1 undef, label %bb23, label %bb24 - -bb14: ; preds = %bb27, %bb24, %bb9 - %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] - %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] - %tmp17 = fmul float 10.5, %tmp16 - %tmp18 = fmul float 11.5, %tmp15 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) - ret void - -bb23: ; preds = %bb13 - br i1 undef, label %bb24, label %bb26 - -bb24: ; preds = %bb26, %bb23, %bb13 - %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] - br i1 undef, label %bb27, label %bb14 - -bb26: ; preds = %bb23 - br label %bb24 - -bb27: ; preds = %bb24 - br label %bb14 -} - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/subreg-eliminate-dead.ll b/llvm/test/CodeGen/R600/subreg-eliminate-dead.ll deleted file mode 100644 index 8bd995a8ecb..00000000000 --- a/llvm/test/CodeGen/R600/subreg-eliminate-dead.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s -; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges -; properly. - -; Just make sure this test doesn't crash. 
-; CHECK-LABEL: foobar: -; CHECK: s_endpgm -define void @foobar() { - %v0 = icmp eq <4 x i32> undef, - %v3 = sext <4 x i1> %v0 to <4 x i32> - %v4 = extractelement <4 x i32> %v3, i32 1 - %v5 = icmp ne i32 %v4, 0 - %v6 = select i1 %v5, i32 undef, i32 0 - %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 - store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 - ret void -} - -declare double @llvm.fma.f64(double, double, double) diff --git a/llvm/test/CodeGen/R600/swizzle-export.ll b/llvm/test/CodeGen/R600/swizzle-export.ll deleted file mode 100644 index 000ee2faa47..00000000000 --- a/llvm/test/CodeGen/R600/swizzle-export.ll +++ /dev/null @@ -1,129 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s - -;EG: {{^}}main: -;EG: EXPORT T{{[0-9]+}}.XYXX -;EG: EXPORT T{{[0-9]+}}.ZXXX -;EG: EXPORT T{{[0-9]+}}.XXWX -;EG: EXPORT T{{[0-9]+}}.XXXW - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* null - %5 = extractelement <4 x float> %4, i32 1 - %6 = load <4 x float>, <4 x float> addrspace(8)* null - %7 = extractelement <4 x float> %6, i32 2 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 0 - %10 = fmul float 0.000000e+00, %9 - %11 = load <4 x float>, <4 x float> addrspace(8)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %5, %12 - %14 = load <4 x float>, <4 x float> addrspace(8)* null - %15 = extractelement <4 x float> %14, i32 0 - %16 = fmul float 0.000000e+00, %15 - %17 = load <4 x float>, <4 x float> addrspace(8)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float 0.000000e+00, %18 - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fmul float %7, %21 - %23 = load <4 x float>, <4 x float> addrspace(8)* null - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float 0.000000e+00, %24 - %26 = load <4 x float>, <4 x float> addrspace(8)* null - %27 = extractelement <4 x float> %26, i32 0 - %28 = fmul float 0.000000e+00, %27 - %29 = load <4 x float>, <4 x float> addrspace(8)* null - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float 0.000000e+00, %30 - %32 = load <4 x float>, <4 x float> addrspace(8)* null - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float 0.000000e+00, %33 - %35 = load <4 x float>, <4 x float> addrspace(8)* null - %36 = extractelement <4 x float> %35, i32 0 - %37 = fmul float 0.000000e+00, %36 - %38 = load <4 x float>, <4 x float> addrspace(8)* null - %39 = extractelement <4 x float> %38, i32 0 - %40 = fmul float 1.000000e+00, %39 - %41 = load <4 x float>, <4 x float> addrspace(8)* null - %42 = extractelement <4 x float> %41, i32 0 - %43 = fmul float 0.000000e+00, %42 - %44 = load <4 x float>, <4 x float> addrspace(8)* null - %45 = extractelement <4 x float> %44, i32 0 - %46 = fmul float 0.000000e+00, %45 - %47 = load <4 x float>, <4 x float> addrspace(8)* null - %48 = extractelement <4 x float> %47, i32 0 - %49 = fmul float 0.000000e+00, %48 - %50 = load <4 x float>, <4 x float> addrspace(8)* null - %51 = extractelement <4 x float> %50, i32 0 - %52 = fmul float 0.000000e+00, %51 - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = fmul 
float 1.000000e+00, %54 - %56 = insertelement <4 x float> undef, float %0, i32 0 - %57 = insertelement <4 x float> %56, float %1, i32 1 - %58 = insertelement <4 x float> %57, float %2, i32 2 - %59 = insertelement <4 x float> %58, float %3, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) - %60 = insertelement <4 x float> undef, float %10, i32 0 - %61 = insertelement <4 x float> %60, float %13, i32 1 - %62 = insertelement <4 x float> %61, float %16, i32 2 - %63 = insertelement <4 x float> %62, float %19, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) - %64 = insertelement <4 x float> undef, float %22, i32 0 - %65 = insertelement <4 x float> %64, float %25, i32 1 - %66 = insertelement <4 x float> %65, float %28, i32 2 - %67 = insertelement <4 x float> %66, float %31, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) - %68 = insertelement <4 x float> undef, float %34, i32 0 - %69 = insertelement <4 x float> %68, float %37, i32 1 - %70 = insertelement <4 x float> %69, float %40, i32 2 - %71 = insertelement <4 x float> %70, float %43, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) - %72 = insertelement <4 x float> undef, float %46, i32 0 - %73 = insertelement <4 x float> %72, float %49, i32 1 - %74 = insertelement <4 x float> %73, float %52, i32 2 - %75 = insertelement <4 x float> %74, float %55, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) - ret void -} - -; EG: {{^}}main2: -; EG: T{{[0-9]+}}.XY__ -; EG: T{{[0-9]+}}.ZXY0 - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = fadd float %0, 2.5 - %3 = fmul float %1, 3.5 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 0 - %6 = call float @llvm.cos.f32(float %5) - %7 = load <4 x float>, <4 x float> addrspace(8)* null - %8 = extractelement <4 x float> %7, i32 0 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = extractelement <4 x float> %9, i32 1 - %11 = insertelement <4 x float> undef, float %2, i32 0 - %12 = insertelement <4 x float> %11, float %3, i32 1 - call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) - %13 = insertelement <4 x float> undef, float %6, i32 0 - %14 = insertelement <4 x float> %13, float %8, i32 1 - %15 = insertelement <4 x float> %14, float %10, i32 2 - %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.cos.f32(float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/R600/tex-clause-antidep.ll b/llvm/test/CodeGen/R600/tex-clause-antidep.ll deleted file mode 100644 index cbb9c50974a..00000000000 --- a/llvm/test/CodeGen/R600/tex-clause-antidep.ll +++ /dev/null @@ -1,25 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX -;CHECK-NEXT: ALU - -define void @test(<4 x float> inreg %reg0) #0 { - %1 = extractelement <4 x float> %reg0, i32 0 - %2 = extractelement <4 x float> %reg0, i32 1 - %3 = extractelement <4 x float> %reg0, i32 2 - %4 = extractelement <4 x float> %reg0, i32 3 - %5 = insertelement <4 x 
float> undef, float %1, i32 0
- %6 = insertelement <4 x float> %5, float %2, i32 1
- %7 = insertelement <4 x float> %6, float %3, i32 2
- %8 = insertelement <4 x float> %7, float %4, i32 3
- %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %11 = fadd <4 x float> %9, %10
- call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0)
- ret void
-}
-
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/texture-input-merge.ll b/llvm/test/CodeGen/R600/texture-input-merge.ll
deleted file mode 100644
index 789538af582..00000000000
--- a/llvm/test/CodeGen/R600/texture-input-merge.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK-NOT: MOV
-
-define void @test(<4 x float> inreg %reg0) #0 {
- %1 = extractelement <4 x float> %reg0, i32 0
- %2 = extractelement <4 x float> %reg0, i32 1
- %3 = extractelement <4 x float> %reg0, i32 2
- %4 = extractelement <4 x float> %reg0, i32 3
- %5 = fmul float %1, 3.0
- %6 = fmul float %2, 3.0
- %7 = fmul float %3, 3.0
- %8 = fmul float %4, 3.0
- %9 = insertelement <4 x float> undef, float %5, i32 0
- %10 = insertelement <4 x float> %9, float %6, i32 1
- %11 = insertelement <4 x float> undef, float %7, i32 0
- %12 = insertelement <4 x float> %11, float %5, i32 1
- %13 = insertelement <4 x float> undef, float %8, i32 0
- %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %17 = fadd <4 x float> %14, %15
- %18 = fadd <4 x float> %17, %16
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0)
- ret void
-}
-
-declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
\ No newline at end of file
diff --git a/llvm/test/CodeGen/R600/trunc-cmp-constant.ll b/llvm/test/CodeGen/R600/trunc-cmp-constant.ll
deleted file mode 100644
index dac74728b3c..00000000000
--- a/llvm/test/CodeGen/R600/trunc-cmp-constant.ll
+++ /dev/null
@@ -1,170 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_0:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}}
-; SI: v_cndmask_b32_e64
-; SI: buffer_store_byte
-define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- %cmp = icmp eq i32 %ext, 0
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FIXME: The negate should be inverting the compare.
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
-; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
-; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- %cmp = icmp eq i32 %ext, 0
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- %cmp = icmp eq i32 %ext, 1
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- %cmp = icmp eq i32 %ext, 1
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- %cmp = icmp eq i32 %ext, -1
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- %cmp = icmp eq i32 %ext, -1
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-
-; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- %cmp = icmp ne i32 %ext, 0
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
-; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
-; SI-NEXT: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = zext i1 %load to i32
- %cmp = icmp ne i32 %ext, 0
- store i1 %cmp, i1 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
-; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
-; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
- %load = load i1, i1 addrspace(1)* %in
- %ext = sext i1 %load to i32
- %cmp = icmp ne i32 %ext, 1
- store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FIXME: This should be one compare. -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: -; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} -; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] -; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} -; SI-NEXT: v_cndmask_b32_e64 -; SI-NEXT: buffer_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %masked = and i8 %load, 255 - %ext = sext i8 %masked to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/R600/trunc-store-f64-to-f16.ll deleted file mode 100644 index c29872beef8..00000000000 --- a/llvm/test/CodeGen/R600/trunc-store-f64-to-f16.ll +++ /dev/null @@ -1,56 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI < %s - -; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: -; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { - %val = load double, double addrspace(1)* %in - %cvt = fptrunc double %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: -; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { - %val = load <2 x double>, <2 x double> addrspace(1)* %in - %cvt = fptrunc <2 x double> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: -; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { - %val = load <3 x double>, <3 x double> addrspace(1)* %in - %cvt = fptrunc <3 x double> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; 
GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: -; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { - %val = load <4 x double>, <4 x double> addrspace(1)* %in - %cvt = fptrunc <4 x double> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: -; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { - %val = load <8 x double>, <8 x double> addrspace(1)* %in - %cvt = fptrunc <8 x double> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: -; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { - %val = load <16 x double>, <16 x double> addrspace(1)* %in - %cvt = fptrunc <16 x double> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/trunc-store-i1.ll b/llvm/test/CodeGen/R600/trunc-store-i1.ll deleted file mode 100644 index b71a838b62c..00000000000 --- a/llvm/test/CodeGen/R600/trunc-store-i1.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -; SI-LABEL: {{^}}global_truncstore_i32_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { - %trunc = trunc i32 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i64_to_i1: -; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { - %trunc = trunc i64 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { - %trunc = trunc i16 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} diff --git a/llvm/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll deleted file mode 100644 index 878ea3f4899..00000000000 --- a/llvm/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests for a bug in the SelectionDAG where custom lowered truncated -; vector stores at the end of a basic block were not being added to the -; LegalizedNodes list, which triggered an assertion failure. 
-
-; CHECK-LABEL: {{^}}test:
-; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
-entry:
- %0 = icmp eq i32 %cond, 0
- br i1 %0, label %if, label %done
-
-if:
- store <4 x i8> %in, <4 x i8> addrspace(1)* %out
- br label %done
-
-done:
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/trunc.ll b/llvm/test/CodeGen/R600/trunc.ll
deleted file mode 100644
index bf690ca4cb2..00000000000
--- a/llvm/test/CodeGen/R600/trunc.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
-
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-
-define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
-; SI-LABEL: {{^}}trunc_i64_to_i32_store:
-; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
-; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
-; SI: buffer_store_dword [[VLOAD]]
-
-; EG-LABEL: {{^}}trunc_i64_to_i32_store:
-; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
-; EG: LSHR
-; EG-NEXT: 2(
-
- %result = trunc i64 %in to i32
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_load_shl_i64:
-; SI-DAG: s_load_dwordx2
-; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
-; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
-; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
-; SI: buffer_store_dword [[VSHL]],
-define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
- %b = shl i64 %a, 2
- %result = trunc i64 %b to i32
- store i32 %result, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; SI-LABEL: {{^}}trunc_shl_i64:
-; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
-; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
-; SI: s_addc_u32
-; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: buffer_store_dword v[[LO_VREG]],
-define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
- %aa = add i64 %a, 234 ; Prevent shrinking store.
- %b = shl i64 %aa, 2 - %result = trunc i64 %b to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits - ret void -} - -; SI-LABEL: {{^}}trunc_i32_to_i1: -; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { - %a = load i32, i32 addrspace(1)* %ptr, align 4 - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}v_trunc_i64_to_i1: -; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} -; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %x = load i64, i64 addrspace(1)* %gep - - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out.gep - ret void -} diff --git a/llvm/test/CodeGen/R600/tti-unroll-prefs.ll b/llvm/test/CodeGen/R600/tti-unroll-prefs.ll deleted file mode 100644 index 76c32afc1f2..00000000000 --- a/llvm/test/CodeGen/R600/tti-unroll-prefs.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s - -; This IR comes from this OpenCL C code: -; -; if (b + 4 > a) { -; for (int i = 0; i < 4; i++, b++) { -; if (b + 1 <= a) -; *(dst + c + b) = 0; -; else -; break; -; } -; } -; -; This test is meant to check that this loop isn't unrolled into more than -; four iterations. The loop unrolling preferences we currently use cause this -; loop to not be unrolled at all, but that may change in the future. 
- -; CHECK-LABEL: @test -; CHECK: store i8 0, i8 addrspace(1)* -; CHECK-NOT: store i8 0, i8 addrspace(1)* -; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { -entry: - %add = add nsw i32 %b, 4 - %cmp = icmp sgt i32 %add, %a - br i1 %cmp, label %for.cond.preheader, label %if.end7 - -for.cond.preheader: ; preds = %entry - %cmp313 = icmp slt i32 %b, %a - br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit - -if.then4.lr.ph: ; preds = %for.cond.preheader - %0 = sext i32 %c to i64 - br label %if.then4 - -if.then4: ; preds = %if.then4.lr.ph, %if.then4 - %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] - %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] - %add2 = add nsw i32 %b.addr.014, 1 - %1 = sext i32 %b.addr.014 to i64 - %add.ptr.sum = add nsw i64 %1, %0 - %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum - store i8 0, i8 addrspace(1)* %add.ptr5, align 1 - %inc = add nsw i32 %i.015, 1 - %cmp1 = icmp slt i32 %inc, 4 - %cmp3 = icmp slt i32 %add2, %a - %or.cond = and i1 %cmp3, %cmp1 - br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge - -for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 - br label %if.end7.loopexit - -if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader - br label %if.end7 - -if.end7: ; preds = %if.end7.loopexit, %entry - ret void -} diff --git a/llvm/test/CodeGen/R600/uaddo.ll b/llvm/test/CodeGen/R600/uaddo.ll deleted file mode 100644 index 11438f267ad..00000000000 --- a/llvm/test/CodeGen/R600/uaddo.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}uaddo_i64_zext: -; SI: add -; SI: addc -; SI: addc - -; EG: ADDC_UINT -; EG: ADDC_UINT -define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i32: -; SI: s_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - %carry = extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i32: -; SI: v_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - 
%carry = extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/llvm/test/CodeGen/R600/udiv.ll b/llvm/test/CodeGen/R600/udiv.ll deleted file mode 100644 index de22a22e502..00000000000 --- a/llvm/test/CodeGen/R600/udiv.ll +++ /dev/null @@ -1,48 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG-LABEL: {{^}}test: -;EG-NOT: SETGE_INT -;EG: CF_END - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1) * %in - %b = load i32, i32 addrspace(1) * %b_ptr - %result = udiv i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;The code generated by udiv is long and complex and may frequently change. 
-;The goal of this test is to make sure the ISel doesn't fail when it gets -;a v4i32 udiv - -;EG-LABEL: {{^}}test2: -;EG: CF_END -;SI-LABEL: {{^}}test2: -;SI: s_endpgm - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = udiv <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}test4: -;EG: CF_END -;SI-LABEL: {{^}}test4: -;SI: s_endpgm - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = udiv <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/udivrem.ll b/llvm/test/CodeGen/R600/udivrem.ll deleted file mode 100644 index b3837f28209..00000000000 --- a/llvm/test/CodeGen/R600/udivrem.ll +++ /dev/null @@ -1,345 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_udivrem: -; EG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG: MULLO_INT -; EG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] -; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] -; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] -; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] -; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %result0 = udiv i32 %x, %y - store i32 %result0, i32 addrspace(1)* %out - %result1 = urem i32 %x, %y - store i32 %result1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_udivrem_v2: -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; 
EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { - %result0 = udiv <2 x i32> %x, %y - store <2 x i32> %result0, <2 x i32> addrspace(1)* %out - %result1 = urem <2 x i32> %x, %y - store <2 x i32> %result1, <2 x i32> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_udivrem_v4: -; EG-DAG: RECIP_UINT -; 
EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], 
[[SECOND_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] -; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { - %result0 = udiv <4 x i32> %x, %y - store <4 x i32> %result0, <4 x i32> addrspace(1)* %out - %result1 = urem <4 x i32> %x, %y - store <4 x i32> %result1, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/udivrem24.ll b/llvm/test/CodeGen/R600/udivrem24.ll deleted file mode 100644 index 4de881b66f1..00000000000 --- a/llvm/test/CodeGen/R600/udivrem24.ll +++ /dev/null @@ -1,245 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}udiv24_i8: -; SI: v_cvt_f32_ubyte -; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: 
FLT_TO_UINT -define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = udiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = udiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i32: -; SI: v_cvt_f32_u32 -; SI-DAG: v_cvt_f32_u32 -; SI-DAG: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}udiv25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}urem24_i8: -; SI: v_cvt_f32_ubyte -; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = urem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}urem24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = urem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}urem24_i32: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}urem25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 
%num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/udivrem64.ll b/llvm/test/CodeGen/R600/udivrem64.ll deleted file mode 100644 index 9f3069bdf80..00000000000 --- a/llvm/test/CodeGen/R600/udivrem64.ll +++ /dev/null @@ -1,223 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_udiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = udiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: AND_INT {{.*}}, 1, - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = urem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = 
lshr i64 %y, 33 - %result = udiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = lshr i64 %y, 33 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = udiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/uint_to_fp.f64.ll b/llvm/test/CodeGen/R600/uint_to_fp.f64.ll deleted file mode 100644 index dfec8eb15cb..00000000000 --- a/llvm/test/CodeGen/R600/uint_to_fp.f64.ll +++ /dev/null @@ -1,98 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = uitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %cast = uitofp i64 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { - %cast = uitofp <2 x i64> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { - %cast = uitofp <4 x i64> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %cast = uitofp i32 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64 
-; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { - %cast = uitofp <2 x i32> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { - %cast = uitofp <4 x i32> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 -; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/llvm/test/CodeGen/R600/uint_to_fp.ll b/llvm/test/CodeGen/R600/uint_to_fp.ll deleted file mode 100644 index 00fea80b1bc..00000000000 --- a/llvm/test/CodeGen/R600/uint_to_fp.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z - -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = uitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = uitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = uitofp <4 x i32> 
%value to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: -; R600: UINT_TO_FLT -; R600: UINT_TO_FLT -; R600: MULADD_IEEE -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 -; SI: s_endpgm -define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { -entry: - %0 = uitofp i64 %in to float - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/unaligned-load-store.ll b/llvm/test/CodeGen/R600/unaligned-load-store.ll deleted file mode 100644 index 82d88ebd3ae..00000000000 --- a/llvm/test/CodeGen/R600/unaligned-load-store.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}unaligned_load_store_i16_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { - %v = load i16, i16 addrspace(3)* %p, align 1 - store i16 %v, i16 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i16_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { - %v = load i16, i16 addrspace(1)* %p, align 1 - store i16 %v, i16 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { - %v = load i32, i32 addrspace(3)* %p, align 1 - store i32 %v, i32 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { - %v = load i32, i32 addrspace(1)* %p, align 1 - store i32 %v, i32 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: 
ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { - %v = load i64, i64 addrspace(3)* %p, align 1 - store i64 %v, i64 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { - %v = load i64, i64 addrspace(1)* %p, align 1 - store i64 %v, i64 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 - ret void -} - -; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4: -; SI: ds_read2_b32 -; SI: s_endpgm -define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 - %val = load i64, i64 addrspace(3)* %ptr, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* - %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - %val = load i64, i64 addrspace(3)* %ptri64, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: buffer_store_dwordx2 -; SI: s_endpgm - -define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4: -; SI: ds_write2_b32 -; SI: s_endpgm -define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { - store i64 %val, i64 addrspace(3)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 - store i64 0, i64 addrspace(3)* %ptr, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { - %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* - 
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - store i64 0, i64 addrspace(3)* %ptri64, align 4 - ret void -} diff --git a/llvm/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/R600/unhandled-loop-condition-assertion.ll deleted file mode 100644 index 036a7e91b47..00000000000 --- a/llvm/test/CodeGen/R600/unhandled-loop-condition-assertion.ll +++ /dev/null @@ -1,115 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s - -; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. - -; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 true, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_false: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 false, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* 
%main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_undef: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 undef, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/R600/unroll.ll b/llvm/test/CodeGen/R600/unroll.ll deleted file mode 100644 index 411a15a4b83..00000000000 --- a/llvm/test/CodeGen/R600/unroll.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s -; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s - - -; This test contains a simple loop that initializes an array declared in -; private memory. We want to make sure these kinds of loops are always -; unrolled, because private memory is slow. 
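A minimal sketch, for illustration only, of the reduced form the CHECK lines below demand once -loop-unroll and -sroa succeed: the private alloca and the loop vanish, and because iteration N stores N into element N, the load of element 5 folds to the literal 5. This is an assumption derived from the CHECK lines, not output copied from opt:

; Expected shape of @test after full unrolling and SROA:
define void @test(i32 addrspace(1)* %out) {
entry:
  ; the [32 x i32] alloca is promoted away; element 5 held its own index
  store i32 5, i32 addrspace(1)* %out
  ret void
}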
- -; CHECK-LABEL: @test -; CHECK-NOT: alloca -; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { -entry: - %0 = alloca [32 x i32] - br label %loop.header - -loop.header: - %counter = phi i32 [0, %entry], [%inc, %loop.inc] - br label %loop.body - -loop.body: - %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter - store i32 %counter, i32* %ptr - br label %loop.inc - -loop.inc: - %inc = add i32 %counter, 1 - %1 = icmp sge i32 %counter, 32 - br i1 %1, label %exit, label %loop.header - -exit: - %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 - %3 = load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/unsupported-cc.ll b/llvm/test/CodeGen/R600/unsupported-cc.ll deleted file mode 100644 index 8ab4faf2f14..00000000000 --- a/llvm/test/CodeGen/R600/unsupported-cc.ll +++ /dev/null @@ -1,125 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests are for condition codes that are not supported by the hardware - -; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp slt i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ult i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float: -; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sle i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ule i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float: -; CHECK: 
SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/urecip.ll b/llvm/test/CodeGen/R600/urecip.ll deleted file mode 100644 index daacc771708..00000000000 --- a/llvm/test/CodeGen/R600/urecip.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_rcp_iflag_f32_e32 - -define void @test(i32 %p, i32 %q) { - %i = udiv i32 %p, %q - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/llvm/test/CodeGen/R600/urem.ll b/llvm/test/CodeGen/R600/urem.ll deleted file mode 100644 index 62841ec2d6c..00000000000 --- a/llvm/test/CodeGen/R600/urem.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by urem is long and complex and may frequently -; change. 
The goal of this test is to make sure the ISel doesn't fail -; when it gets a v2i32/v4i32 urem - -; FUNC-LABEL: {{^}}test_urem_i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = urem i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 -; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] -; SI: v_subrev_i32 -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = urem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = urem <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = urem <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = urem i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = urem <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = urem <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/R600/use-sgpr-multiple-times.ll deleted file mode 100644 index f26f30022b4..00000000000 --- a/llvm/test/CodeGen/R600/use-sgpr-multiple-times.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float 
@llvm.fma.f32(float, float, float) #1 -declare float @llvm.fmuladd.f32(float, float, float) #1 -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 - - -; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { - %dbl = fadd float %a, %a - store float %dbl, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; Don't use fma since fma c, x, y is canonicalized to fma x, c, y -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { - %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 - store i32 %fma, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/usubo.ll b/llvm/test/CodeGen/R600/usubo.ll deleted file mode 100644 index 3c9b1622a07..00000000000 --- a/llvm/test/CodeGen/R600/usubo.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}usubo_i64_zext: - -; EG: SUBB_UINT -; EG: ADDC_UINT -define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i32: -; SI: s_sub_i32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i32: -; SI: v_subrev_i32_e32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - 
%val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i64: -; SI: v_sub_i32 -; SI: v_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/llvm/test/CodeGen/R600/v1i64-kernel-arg.ll b/llvm/test/CodeGen/R600/v1i64-kernel-arg.ll deleted file mode 100644 index 31755125c03..00000000000 --- a/llvm/test/CodeGen/R600/v1i64-kernel-arg.ll +++ /dev/null @@ -1,17 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; i64 arg works, v1i64 arg does not. -; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { - store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 - ret void -} - diff --git a/llvm/test/CodeGen/R600/v_cndmask.ll b/llvm/test/CodeGen/R600/v_cndmask.ll deleted file mode 100644 index c368c5aaf7d..00000000000 --- a/llvm/test/CodeGen/R600/v_cndmask.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; SI-LABEL: {{^}}v_cnd_nan_nosgpr: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { - %idx = call i32 @llvm.r600.read.tidig.x() #1 - %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx - %f = load float, float addrspace(1)* %fptr - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - - -; This requires slightly trickier SGPR operand legalization since the -; single constant bus SGPR usage is the last operand, and it should -; never be moved. 
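On SI, a single VALU instruction may read at most one SGPR-class operand through the constant bus, and for v_cndmask_b32_e64 that one read is the condition mask in the final source operand, so the legalizer must leave the mask in place and move any other scalar input into a VGPR. A rough sketch of the resulting shape, with illustrative register numbers rather than checked compiler output:

  v_mov_b32_e32 v1, s2                  ; any extra scalar input is copied into a VGPR first
  v_cndmask_b32_e64 v0, v1, -1, s[4:5]  ; the mask SGPR pair keeps the one constant-bus slot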
- -; SI-LABEL: {{^}}v_cnd_nan: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/R600/valu-i1.ll b/llvm/test/CodeGen/R600/valu-i1.ll deleted file mode 100644 index 7d0ebd139f5..00000000000 --- a/llvm/test/CodeGen/R600/valu-i1.ll +++ /dev/null @@ -1,188 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: @test_if -; Make sure the i1 values created by the cfg structurizer pass are -; moved using VALU instructions -; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 -; SI: v_mov_b32_e32 v{{[0-9]}}, -1 -define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { -entry: - switch i32 %a, label %default [ - i32 0, label %case0 - i32 1, label %case1 - ] - -case0: - %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 0, i32 addrspace(1)* %arrayidx1, align 4 - br label %end - -case1: - %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 1, i32 addrspace(1)* %arrayidx5, align 4 - br label %end - -default: - %cmp8 = icmp eq i32 %a, 2 - %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - br i1 %cmp8, label %if, label %else - -if: - store i32 2, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -else: - store i32 3, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -end: - ret void -} - -; SI-LABEL: @simple_test_v_if -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] - -; SI: ; BB#1 -; SI: buffer_store_dword -; SI: s_endpgm - -; SI: BB1_2: -; SI: s_or_b64 exec, exec, [[BR_SREG]] -; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit - -store: - %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid - store i32 999, i32 addrspace(1)* %gep - ret void - -exit: - ret void -} - -; SI-LABEL: @simple_test_v_loop -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: s_cbranch_execz BB2_2 - -; SI: ; BB#1: -; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; SI: BB2_3: -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, -; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] -; SI: s_andn2_b64 exec, exec, [[OR_SREG]] -; SI: s_cbranch_execnz BB2_3 - -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - %limit = add i32 %tid, 64 - br i1 %is.0, label %loop, label %exit - -loop: - %i = phi i32 [%tid, %entry], [%i.inc, %loop] - %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i - %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i - %load = load i32, i32 addrspace(1)* %src - store i32 
%load, i32 addrspace(1)* %gep.dst - %i.inc = add nsw i32 %i, 1 - %cmp = icmp eq i32 %limit, %i.inc - br i1 %cmp, label %exit, label %loop - -exit: - ret void -} - -; SI-LABEL: @multi_vcond_loop - -; Load loop limit from buffer -; Branch to exit if uniformly not taken -; SI: ; BB#0: -; SI: buffer_load_dword [[VBOUND:v[0-9]+]] -; SI: v_cmp_lt_i32_e32 vcc -; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] -; SI: s_cbranch_execz BB3_2 - -; Initialize inner condition to false -; SI: ; BB#1: -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} -; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] - -; Clear exec bits for workitems that load -1s -; SI: BB3_3: -; SI: buffer_load_dword [[B:v[0-9]+]] -; SI: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] -; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] -; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] -; SI: s_cbranch_execz BB3_5 - -; SI: BB#4: -; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] - -; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] -; SI: s_andn2_b64 exec, exec, [[COND_STATE]] -; SI: s_cbranch_execnz BB3_3 - -; SI: BB#6 -; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: BB3_2: -; SI-NOT: [[COND_STATE]] -; SI: s_endpgm - -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { -bb: - %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 - %tmp4 = sext i32 %tmp to i64 - %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 - %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 - %tmp7 = icmp sgt i32 %tmp6, 0 - %tmp8 = sext i32 %tmp6 to i64 - br i1 %tmp7, label %bb10, label %bb26 - -bb10: ; preds = %bb, %bb20 - %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] - %tmp12 = add nsw i64 %tmp11, %tmp4 - %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 - %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 - %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 - %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 - %tmp17 = icmp ne i32 %tmp14, -1 - %tmp18 = icmp ne i32 %tmp16, -1 - %tmp19 = and i1 %tmp17, %tmp18 - br i1 %tmp19, label %bb20, label %bb26 - -bb20: ; preds = %bb10 - %tmp21 = add nsw i32 %tmp16, %tmp14 - %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 - store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 - %tmp23 = add nuw nsw i64 %tmp11, 1 - %tmp24 = icmp slt i64 %tmp23, %tmp8 - br i1 %tmp24, label %bb10, label %bb26 - -bb26: ; preds = %bb10, %bb20, %bb - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/R600/vector-alloca.ll b/llvm/test/CodeGen/R600/vector-alloca.ll deleted file mode 100644 index 6f3b4847fbd..00000000000 --- a/llvm/test/CodeGen/R600/vector-alloca.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA 
-check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
-
-; FUNC-LABEL: {{^}}vector_read:
-; EG: MOV
-; EG: MOV
-; EG: MOV
-; EG: MOV
-; EG: MOVA_INT
-define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
-entry:
- %0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
- store i32 0, i32* %x
- store i32 1, i32* %y
- store i32 2, i32* %z
- store i32 3, i32* %w
- %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
- %2 = load i32, i32* %1
- store i32 %2, i32 addrspace(1)* %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}vector_write:
-; EG: MOV
-; EG: MOV
-; EG: MOV
-; EG: MOV
-; EG: MOVA_INT
-; EG: MOVA_INT
-define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
-entry:
- %0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
- store i32 0, i32* %x
- store i32 0, i32* %y
- store i32 0, i32* %z
- store i32 0, i32* %w
- %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
- store i32 1, i32* %1
- %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
- %3 = load i32, i32* %2
- store i32 %3, i32 addrspace(1)* %out
- ret void
-}
-
-; This test should be optimized to:
-; store i32 0, i32 addrspace(1)* %out
-; FUNC-LABEL: {{^}}bitcast_gep:
-; EG: STORE_RAW
-define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
-entry:
- %0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
- store i32 0, i32* %x
- store i32 0, i32* %y
- store i32 0, i32* %z
- store i32 0, i32* %w
- %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
- %2 = bitcast i32* %1 to [4 x i32]*
- %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
- %4 = load i32, i32* %3
- store i32 %4, i32 addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/vertex-fetch-encoding.ll b/llvm/test/CodeGen/R600/vertex-fetch-encoding.ll
deleted file mode 100644
index fb6a17e6714..00000000000
--- a/llvm/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
-; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
-
-; NI: {{^}}vtx_fetch32:
-; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
-; CM: {{^}}vtx_fetch32:
-; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
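Worth noting: the NI and CM encodings above are byte-for-byte identical except for the fourth and eleventh bytes (0x10 and 0x08 on Northern Islands, 0x00 on Cayman), which is why this file carries separate NI and CM check prefixes instead of one shared prefix.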
- -define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; NI: {{^}}vtx_fetch128: -; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 -; XXX: Add a case for Cayman when v4i32 stores are supported. - -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in - store <4 x i32> %0, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/vop-shrink.ll b/llvm/test/CodeGen/R600/vop-shrink.ll deleted file mode 100644 index 9b2f229c05a..00000000000 --- a/llvm/test/CodeGen/R600/vop-shrink.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test that we correctly commute a sub instruction -; FUNC-LABEL: {{^}}sub_rev: -; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s -; SI: v_subrev_i32_e32 v{{[0-9]+}}, s - -; ModuleID = 'vop-shrink.ll' - -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { -entry: - %vgpr = call i32 @llvm.r600.read.tidig.x() #1 - %tmp = icmp eq i32 %cond, 0 - br i1 %tmp, label %if, label %else - -if: ; preds = %entry - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %tmp2 = extractelement <4 x i32> %sgpr, i32 1 - store i32 %tmp2, i32 addrspace(1)* %out - br label %endif - -else: ; preds = %entry - %tmp3 = extractelement <4 x i32> %sgpr, i32 2 - %tmp4 = sub i32 %vgpr, %tmp3 - store i32 %tmp4, i32 addrspace(1)* %out - br label %endif - -endif: ; preds = %else, %if - ret void -} - -; Test that we fold an immediate that was illegal for a 64-bit op into the -; 32-bit op when we shrink it. 
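Concretely: 1.024000e+03 is 0x44800000, which is not one of the hardware inline constants, and on SI the 64-bit VOP3 encoding has no literal field; only once the add is shrunk to the 32-bit VOP2 form can the value be emitted as a trailing 32-bit literal, which is what the check below expects. Illustrative form (register operands assumed):

  v_add_f32_e32 v0, 0x44800000, v0  ; 1024.0 travels as the literal dword after the opcode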
- -; FUNC-LABEL: {{^}}add_fold: -; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { -entry: - %tmp = call i32 @llvm.r600.read.tidig.x() - %tmp1 = uitofp i32 %tmp to float - %tmp2 = fadd float %tmp1, 1.024000e+03 - store float %tmp2, float addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { readnone } diff --git a/llvm/test/CodeGen/R600/vselect.ll b/llvm/test/CodeGen/R600/vselect.ll deleted file mode 100644 index a3014b03d2b..00000000000 --- a/llvm/test/CodeGen/R600/vselect.ll +++ /dev/null @@ -1,77 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG: {{^}}test_select_v2i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { -entry: - %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 - %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 - %cmp = icmp ne <2 x i32> %0, %1 - %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v2f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2f32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 - %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 - %cmp = fcmp une <2 x float> %0, %1 - %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v4i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 - %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 - %cmp = icmp ne <4 x i32> %0, %1 - %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], 
PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
-entry:
- %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
- %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
- %cmp = fcmp une <4 x float> %0, %1
- %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
- store <4 x float> %result, <4 x float> addrspace(1)* %out
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/vselect64.ll b/llvm/test/CodeGen/R600/vselect64.ll
deleted file mode 100644
index ef85ebe7899..00000000000
--- a/llvm/test/CodeGen/R600/vselect64.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
-
-; CHECK-LABEL: {{^}}test_select_v4i64:
-; Make sure the vectors aren't being stored on the stack. We know they are
-; being stored on the stack if the shader uses at least 10 registers.
-; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
-entry:
- %cmp = icmp ne <4 x i32> %c,
- %result = select <4 x i1> %cmp, <4 x i64> , <4 x i64>
- store <4 x i64> %result, <4 x i64> addrspace(1)* %out
- ret void
-}
-
diff --git a/llvm/test/CodeGen/R600/vtx-fetch-branch.ll b/llvm/test/CodeGen/R600/vtx-fetch-branch.ll
deleted file mode 100644
index 4584d6e2525..00000000000
--- a/llvm/test/CodeGen/R600/vtx-fetch-branch.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
-
-; This tests for a bug where vertex fetch clauses right before an ENDIF
-; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
-; for the ALU clause before the vertex fetch instead of emitting a POP instruction
-; after the fetch clause.
-
-
-; CHECK-LABEL: {{^}}test:
-; CHECK-NOT: ALU_POP_AFTER
-; CHECK: TEX
-; CHECK-NEXT: POP
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
-entry:
- %0 = icmp eq i32 %cond, 0
- br i1 %0, label %endif, label %if
-
-if:
- %1 = load i32, i32 addrspace(1)* %in
- br label %endif
-
-endif:
- %x = phi i32 [ %1, %if], [ 0, %entry]
- store i32 %x, i32 addrspace(1)* %out
- br label %done
-
-done:
- ret void
-}
diff --git a/llvm/test/CodeGen/R600/vtx-schedule.ll b/llvm/test/CodeGen/R600/vtx-schedule.ll
deleted file mode 100644
index 912e258ebb8..00000000000
--- a/llvm/test/CodeGen/R600/vtx-schedule.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; This test is for a scheduler bug where VTX_READ instructions that used
-; the result of another VTX_READ instruction were being grouped in the
-; same fetch clause.
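The hardware issues a fetch clause as one batch, so a VTX_READ whose address is produced by another VTX_READ cannot share a clause with its producer; the dependent read has to open a new clause. That is what the two separate Fetch clause / VTX_READ_32 checks below enforce; the expected shape is roughly (a sketch, not verbatim output):

  Fetch clause { VTX_READ_32 ... }  ; loads the pointer
  Fetch clause { VTX_READ_32 ... }  ; dependent load, forced into a new clause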
- -; CHECK: {{^}}test: -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { -entry: - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/wait.ll b/llvm/test/CodeGen/R600/wait.ll deleted file mode 100644 index 5cc7577cad3..00000000000 --- a/llvm/test/CodeGen/R600/wait.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_load_dwordx4 -; CHECK: s_load_dwordx4 -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; CHECK: s_endpgm -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { -main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) - %tmp12 = extractelement <4 x float> %tmp11, i32 0 - %tmp13 = extractelement <4 x float> %tmp11, i32 1 - call void @llvm.AMDGPU.barrier.global() #1 - %tmp14 = extractelement <4 x float> %tmp11, i32 2 -; %tmp15 = extractelement <4 x float> %tmp11, i32 3 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt - %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 - %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 - %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) - %tmp19 = extractelement <4 x float> %tmp18, i32 0 - %tmp20 = extractelement <4 x float> %tmp18, i32 1 - %tmp21 = extractelement <4 x float> %tmp18, i32 2 - %tmp22 = extractelement <4 x float> %tmp18, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.global() #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind readnone } - -!0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", null} diff --git a/llvm/test/CodeGen/R600/work-item-intrinsics.ll b/llvm/test/CodeGen/R600/work-item-intrinsics.ll deleted file mode 100644 index 4328e964c1b..00000000000 --- a/llvm/test/CodeGen/R600/work-item-intrinsics.ll +++ /dev/null @@ -1,238 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: 
llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}ngroups_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X - -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 
[[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. - -; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_x: -; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_y: -; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_z: -; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.ngroups.x() #0 -declare i32 @llvm.r600.read.ngroups.y() #0 -declare i32 @llvm.r600.read.ngroups.z() #0 - -declare i32 @llvm.r600.read.global.size.x() #0 -declare i32 @llvm.r600.read.global.size.y() #0 -declare i32 @llvm.r600.read.global.size.z() #0 - -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - -declare i32 @llvm.r600.read.tgid.x() #0 -declare i32 @llvm.r600.read.tgid.y() #0 -declare i32 @llvm.r600.read.tgid.z() #0 - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.tidig.y() #0 -declare i32 @llvm.r600.read.tidig.z() #0 - -declare i32 @llvm.AMDGPU.read.workdim() #0 - -attributes #0 = { readnone } 
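These intrinsics are exactly the pieces an OpenCL get_global_id(0) query decomposes into: group id times workgroup size plus the id within the group. A sketch of that expansion in IR, with the value names being assumptions rather than anything taken from the tests above:

  %grp  = call i32 @llvm.r600.read.tgid.x()
  %lsz  = call i32 @llvm.r600.read.local.size.x()
  %lid  = call i32 @llvm.r600.read.tidig.x()
  %base = mul i32 %lsz, %grp
  %gid  = add i32 %base, %lid

The fill3d kernel in the next test performs this same expansion for all three dimensions.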
diff --git a/llvm/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/R600/wrong-transalu-pos-fix.ll deleted file mode 100644 index 8b383e4c393..00000000000 --- a/llvm/test/CodeGen/R600/wrong-transalu-pos-fix.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s - -; We want all MULLO_INT inst to be last in their instruction group -;CHECK: {{^}}fill3d: -;CHECK-NOT: MULLO_INT T[0-9]+ - -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { -entry: - %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 - %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 - %mul = mul i32 %y.i18, %x.i - %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 - %mul3 = mul i32 %mul, %z.i17 - %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 - %mul26.i = mul i32 %x.i12.i, %x.i.i - %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.i16 = add i32 %x.i4.i, %mul26.i - %mul7 = mul i32 %add.i16, %y.i18 - %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 - %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 - %mul30.i = mul i32 %y.i14.i, %y.i.i - %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %add.i14 = add i32 %mul30.i, %mul7 - %mul819 = add i32 %add.i14, %y.i6.i - %add = mul i32 %mul819, %z.i17 - %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 - %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 - %mul33.i = mul i32 %z.i16.i, %z.i.i - %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 - %add.i = add i32 %z.i8.i, %mul33.i - %add13 = add i32 %add.i, %add - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 - store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.z() #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - -!opencl.kernels = !{!0, !1, !2} - -!0 = !{null} -!1 = !{null} -!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/llvm/test/CodeGen/R600/xor.ll b/llvm/test/CodeGen/R600/xor.ll deleted file mode 100644 index 089db59eabc..00000000000 --- a/llvm/test/CodeGen/R600/xor.ll +++ /dev/null @@ -1,173 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}xor_v2i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 - %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 - %result = xor <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_v4i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 - %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 - %result = xor <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} -; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1) * %in0 - %b = load float, float addrspace(1) * %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 1.000000e+00 - %xor = xor i1 %acmp, %bcmp - %result = select i1 %xor, float %a, float %b - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_xor_i1: -; SI: buffer_load_ubyte [[B:v[0-9]+]] -; SI: buffer_load_ubyte [[A:v[0-9]+]] -; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] -; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { - %a = load i1, i1 addrspace(1)* %in0 - %b = load i1, i1 addrspace(1)* %in1 - %xor = xor i1 %a, %b - store i1 %xor, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i32: -; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i32: -; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: 
{{^}}scalar_not_i32: -; SI: s_not_b32 -define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i32: -; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i64: -; SI: v_xor_b32_e32 -; SI: v_xor_b32_e32 -; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i64: -; SI: s_xor_b64 -; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_not_i64: -; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i64: -; SI: v_not_b32 -; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Test that we have a pattern to match xor inside a branch. -; Note that in the future the backend may be smart enough to -; use an SALU instruction for this. 
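In this particular test both i64 inputs are scalar kernel arguments and the branch condition is uniform, so the values can stay in SGPRs across the branch; the check below accordingly expects the scalar s_xor_b64 form.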
- -; FUNC-LABEL: {{^}}xor_cf: -; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = xor i64 %a, %b - br label %endif - -else: - %2 = load i64, i64 addrspace(1)* %in - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/llvm/test/CodeGen/R600/zero_extend.ll b/llvm/test/CodeGen/R600/zero_extend.ll deleted file mode 100644 index 033055db185..00000000000 --- a/llvm/test/CodeGen/R600/zero_extend.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}test: -; R600: MEM_RAT_CACHELESS STORE_RAW -; R600: MEM_RAT_CACHELESS STORE_RAW - -; SI: {{^}}test: -; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] -; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = mul i32 %a, %b - %1 = add i32 %0, %c - %2 = zext i32 %1 to i64 - store i64 %2, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}testi1toi32: -; SI: v_cndmask_b32 -define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = zext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}zext_i1_to_i64: -; SI: s_mov_b32 s{{[0-9]+}}, 0 -; SI: v_cmp_eq_i32 -; SI: v_cndmask_b32 -define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %ext = zext i1 %cmp to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} -- cgit v1.2.3