| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-09-20 05:01:53 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-09-20 05:01:53 +0000 |
| commit | b81495dccb2977e8861fa40e0b38657b46f148e6 | |
| tree | 5b9cc193b82458716f81e05376caf19c023f973f | |
| parent | e08ccfe3a116e4c0907fed266788d05681cb5db2 | |
AMDGPU: Match load d16 hi instructions
This also starts selecting global loads for constant addresses in some
cases. Some still end up selecting to MUBUF, which requires
investigation.
We still get sub-optimal register allocation and extra waitcnts
inserted because the liveness of the two register halves is not
really tracked.
llvm-svn: 313716
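To illustrate the pattern being matched: a 16-bit (or extended 8-bit) load whose result is inserted into the high element of a packed <2 x i16> or <2 x half> vector. The sketch below is adapted from the load_local_hi_v2i16_reglo test added in load-hi16.ll by this change; per the new CHECK lines, gfx9 is expected to select this to a single ds_read_u16_d16_hi writing the high half of the tied destination register, while VI still emits a plain ds_read_u16 plus packing code.

```llvm
; Load a 16-bit value from LDS into the high half of a <2 x i16>,
; keeping the incoming %reg value in the low half. With d16-hi load
; matching, gfx9 selects ds_read_u16_d16_hi for the load + insert.
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ret <2 x i16> %build1
}

attributes #0 = { nounwind }
```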
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 12 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 7 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/BUFInstructions.td | 67 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/DSInstructions.td | 47 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td | 78 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 6 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-hi16.ll | 506 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 9 |
10 files changed, 686 insertions, 66 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f91cce1b08b..3ad19694570 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -126,10 +126,10 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratchOffen(SDNode *Root, + bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFScratchOffset(SDNode *Root, + bool SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; @@ -1107,7 +1107,7 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const MVT::i32)); } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { @@ -1130,7 +1130,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, // In a call sequence, stores to the argument stack area are relative to the // stack pointer. - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); @@ -1160,7 +1160,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, @@ -1175,7 +1175,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? 
Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 226191420f1..52f803ac097 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -252,6 +252,11 @@ class GlobalAddress : CodePatPred<[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; +class GlobalLoadAddress : CodePatPred<[{ + auto AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS; +}]>; + class FlatLoadAddress : CodePatPred<[{ const auto AS = cast<MemSDNode>(N)->getAddressSpace(); return AS == AMDGPUASI.FLAT_ADDRESS || @@ -292,7 +297,7 @@ class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress; class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress; -class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalAddress; +class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress; class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress; class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 075788edb33..076ce0f0cc4 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>; +def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; @@ -425,16 +425,18 @@ class MUBUF_SetupAddr<int addrKind> { class MUBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + bit HasTiedDest = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs vdataClass:$vdata), - getMUBUFIns<addrKindCopy>.ret, + !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); let mayLoad = 1; let mayStore = 0; let maybeAtomic = 1; @@ -444,27 +446,30 @@ class MUBUF_Load_Pseudo <string opName, // opcode because it needs an N+1 register class dest register. 
multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, ValueType load_vt = i32, - SDPatternOperator ld = null_frag> { + SDPatternOperator ld = null_frag, + bit TiedDest = 0> { def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + TiedDest, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, MUBUFAddr64Table<0>; def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + TiedDest, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, MUBUFAddr64Table<1>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; } } @@ -812,7 +817,7 @@ defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < >; defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_ubyte_d16_hi", VGPR_32, i32 + "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1 >; defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < @@ -820,7 +825,7 @@ defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < >; defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_sbyte_d16_hi", VGPR_32, i32 + "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1 >; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < @@ -828,7 +833,7 @@ defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < >; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < - "buffer_load_short_d16_hi", VGPR_32, i32 + "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1 >; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < @@ -1149,6 +1154,34 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, >; } +// XXX - Is it possible to have a complex pattern in a PatFrag? 
+multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : Pat < + (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)))), + (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + def : Pat < + (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)))))), + (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + + def : Pat < + (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))), + (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + def : Pat < + (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))), + (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; +} + defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; @@ -1160,6 +1193,12 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; +let Predicates = [HasD16LoadStore] in { +defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>; +defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>; +defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>; +} + // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF_Pseudo offset, diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index c7c3e015dc0..e66bf402178 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -145,16 +145,22 @@ class DS_1A2D_Off8_RET<string opName, let hasPostISelHook = 1; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset> +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, ofs:$offset, gds:$gds), + !if(HasTiedOutput, + (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), "$vdst, $addr$offset$gds"> { - + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); let has_data0 = 0; let has_data1 = 0; } +class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> : + DS_1A_RET<opName, rc, 1>; + class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs rc:$vdst), @@ -450,7 +456,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>; } let mayStore = 0 
in { @@ -468,12 +474,12 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>; def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>; let SubtargetPredicate = HasD16LoadStore in { -def DS_READ_U8_D16 : DS_1A_RET<"ds_read_u8_d16">; -def DS_READ_U8_D16_HI : DS_1A_RET<"ds_read_u8_d16_hi">; -def DS_READ_I8_D16 : DS_1A_RET<"ds_read_i8_d16">; -def DS_READ_I8_D16_HI : DS_1A_RET<"ds_read_i8_d16_hi">; -def DS_READ_U16_D16 : DS_1A_RET<"ds_read_u16_d16">; -def DS_READ_U16_D16_HI : DS_1A_RET<"ds_read_u16_d16_hi">; +def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">; +def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">; +def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">; +def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">; +def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">; +def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; } let SubtargetPredicate = HasDSAddTid in { @@ -543,6 +549,18 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < (inst $ptr, (as_i16imm $offset), (i1 0)) >; +multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> { + def : Pat < + (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))), + (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) + >; + + def : Pat < + (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))), + (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) + >; +} + def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>; def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>; def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>; @@ -565,6 +583,15 @@ def : Pat < (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) >; + +let Predicates = [HasD16LoadStore] in { +let AddedComplexity = 100 in { +defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>; +defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>; +defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>; +} +} + class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 85f610304b0..8c32ce232dc 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -125,15 +125,18 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // same encoding value as exec_hi, so it isn't possible to use that if // saddr is 32-bit (which isn't handled here yet). 
class FLAT_Load_Pseudo <string opName, RegisterClass regClass, + bit HasTiedOutput = 0, bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), !con( !con( - !con((ins VReg_64:$vaddr), - !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), - (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, slc:$slc)), + !con( + !con((ins VReg_64:$vaddr), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), + (ins GLC:$glc, slc:$slc)), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; @@ -141,6 +144,9 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, let enabled_saddr = EnableSaddr; let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); let maybeAtomic = 1; + + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, @@ -163,10 +169,10 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let maybeAtomic = 1; } -multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> { +multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo<opName, regClass, 1, 1>; - def _SADDR : FLAT_Load_Pseudo<opName, regClass, 1, 1, 1>; + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>; + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>; } } @@ -360,12 +366,12 @@ def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { -def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32>; -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32>; -def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32>; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32>; -def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32>; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32>; +def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>; +def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>; +def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>; +def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; @@ -483,12 +489,12 @@ defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; -defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32>; -defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", 
VGPR_32>; -defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32>; -defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32>; -defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32>; -defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32>; +defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; @@ -624,6 +630,30 @@ class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat (inst $vaddr, $offset, 0, $slc) >; +multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : Pat < + (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; + + def : Pat < + (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; +} + +multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : Pat < + (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; + + def : Pat < + (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; +} + class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), (inst $vaddr, $offset, 0, $slc) @@ -729,6 +759,12 @@ def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; let Predicates = [HasD16LoadStore] in { def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; + +let AddedComplexity = 3 in { +defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>; +defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>; +defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>; +} } } // End Predicates = [HasFlatAddressSpace] @@ -761,6 +797,10 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>; let Predicates = [HasD16LoadStore] in { def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; + +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>; +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>; +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>; } def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll 
b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 2d7e49832e2..b3d6d79e8db 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -92,14 +92,18 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort ; SICIVI: buffer_store_short ; SICIVI: buffer_store_short ; SICIVI: buffer_store_short +; GFX9: buffer_load_ushort +; GFX9: buffer_load_ushort +; GFX9: global_load_short_d16_hi + ; GFX9: buffer_store_dword ; GFX9: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 9da2479853b..4429cfa7b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -7,7 +7,7 @@ ; unless isFabsFree returns true ; GCN-LABEL: {{^}}s_fabs_free_f16: -; GCN: flat_load_ushort [[VAL:v[0-9]+]], +; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]], ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] ; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -75,8 +75,8 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half } ; GCN-LABEL: {{^}}fabs_fold_f16: -; GCN: flat_load_ushort [[IN0:v[0-9]+]] -; GCN: flat_load_ushort [[IN1:v[0-9]+]] +; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]] +; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]] ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] ; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll new file mode 100644 index 00000000000..806664bb32e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -0,0 +1,506 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s + +; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16_hi v0, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + ret <2 x i16> %build +} + +; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + ret <2 x i16> %build1 +} + +; Show that we get reasonable regalloc without physreg constraints. 
+; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo: +; GCN: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + ret <2 x i16> %build +} + +; FIXME: Remove m0 initialization +; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift: +; GCN: s_waitcnt +; GFX9-NEXT: s_mov_b32 m0, -1 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +; VI: v_lshlrev_b32_e32 v0, 16, v0 +define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 { +entry: + %load = load i16, i16 addrspace(3)* %in + %zext = zext i16 %load to i32 + %shift = shl i32 %zext, 16 + ret i32 %shift +} + +; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u16 +define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { +entry: + %load = load half, half addrspace(3)* %in + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_u8_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_u8 +define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: ds_read_i8_d16_hi v1, v0 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: ds_read_i8 +define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(3)* %in + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off 
offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047 + %load = load i16, i16 addrspace(1)* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047 + %load = load half, half addrspace(1)* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 +define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095 + %load = load i8, i8 addrspace(1)* %gep + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: load_flat_hi_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_sdwa +define void @load_flat_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i16, i16 addrspace(4)* %in + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_sdwa +define void 
@load_flat_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 { +entry: + %load = load half, half addrspace(4)* %in + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ubyte v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_sdwa +define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(4)* %in + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_sbyte v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_sdwa +define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i16 %reg) #0 { +entry: + %load = load i8, i8 addrspace(4)* %in + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} +define void @load_private_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16* %in, i64 2047 + %load = load i16, i16* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}} +define void @load_private_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half* %in, i64 2047 + %load = load half, half* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +define 
void @load_private_hi_v2i16_reglo_vreg_nooff(i16* %in, i16 %reg) #0 { +entry: + %load = load volatile i16, i16* inttoptr (i32 4094 to i16*) + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_hi_v2f16_reglo_vreg_nooff(half* %in, half %reg) #0 { +entry: + %load = load volatile half, half* inttoptr (i32 4094 to half*) + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} +define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %load = load i8, i8* %gep + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}} +define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i8, i8* %in, i64 2047 + %load = load i8, i8* %gep + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i16 %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; 
GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i16 %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: +; GCN: s_waitcnt +; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}} +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}} +define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8* %in, half %reg) #0 { +entry: + %load = load volatile i8, i8* inttoptr (i32 4094 to i8*) + %ext = zext i8 %load to i16 + %bc.ext = bitcast i16 %ext to half + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg: +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort +define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 { +entry: + %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047 + %load = load i16, i16 addrspace(2)* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: load_constant_hi_v2f16_reglo_vreg +; GCN: s_waitcnt +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: global_store_dword +; GFX9-NEXT: s_waitcnt +; GFX9-NEXT: s_setpc_b64 + +; VI: flat_load_ushort +define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 { +entry: + %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047 + %load = load half, half addrspace(2)* %gep + %build0 = insertelement <2 x half> undef, half %reg, i32 0 + %build1 = insertelement <2 x half> %build0, half %load, i32 1 + store <2 x half> %build1, <2 x half> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index 4970375d40d..69675a3351c 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -228,15 +228,13 @@ bb: ret void } +; FIXME: Can we avoid waitcnt between the two halves? 
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi: ; GCN: ds_read_b32 [[VEC0:v[0-9]+]] ; GCN: ds_read_b32 [[VEC1:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]] -; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]] - -; FIXME: Remove and -; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]] -; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]] +; GCN: ds_read_u16 [[PACKED:v[0-9]+]] +; GCN-NEXT: s_waitcnt +; GCN: ds_read_u16_d16_hi [[PACKED]] ; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 3541e6d114c..83b980a527e 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -663,10 +663,10 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %ou ; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: v_pk_add_u16 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_add_u16 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload @@ -702,10 +702,11 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %ou ; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: v_pk_add_u16 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} + +; GFX9: v_pk_add_u16 +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload |

