author    Matthias Braun <matze@braunis.de>  2018-01-19 06:08:15 +0000
committer Matthias Braun <matze@braunis.de>  2018-01-19 06:08:15 +0000
commit    8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa (patch)
tree      6d24f43c40f1f809368b4043fad52c9ab20fd557 /llvm/test/CodeGen/AMDGPU
parent    f4cd9083acecb478f1d19532bb23624128b5af40 (diff)
download  bcm5719-llvm-8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa.tar.gz
          bcm5719-llvm-8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa.zip
Move tests to the correct place
test/CodeGen/MIR is for testing the MIR parser/printer. Tests for passes and targets belong to test/CodeGen/TARGETNAME.

llvm-svn: 322925
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir                                     | 709
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fold-multiple.mir                                        |  40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir                   | 122
-rw-r--r-- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir       | 163
-rw-r--r-- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir | 161
-rw-r--r-- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir | 161
6 files changed, 1356 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
new file mode 100644
index 00000000000..cae8ed80d16
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -0,0 +1,709 @@
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
+--- |
+ define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f32.add = fadd float %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile float %f32.add, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f32.add = fadd float %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile float %f32.add, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f32.add = fadd float %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile float %f32.add, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f16.add1 = fadd half %f16.val1, 0xH3C00
+ %f32.add = fadd float %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile half %f16.add1, half addrspace(1)* undef
+ store volatile float %f32.add, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH0001
+ %f16.add1 = fadd half %f16.val1, 0xH0001
+ store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xHFFFE
+ %f16.add1 = fadd half %f16.val1, 0xHFFFE
+ %f32.add = fadd float %f32.val, 0xffffffffc0000000
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile half %f16.add1, half addrspace(1)* undef
+ store volatile float %f32.add, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 {
+ %f32.val0 = load volatile float, float addrspace(1)* undef
+ %f32.val1 = load volatile float, float addrspace(1)* undef
+ %f32.val = load volatile float, float addrspace(1)* undef
+ %f32.add0 = fadd float %f32.val0, 1.0
+ %f32.add1 = fadd float %f32.val1, 1.0
+ store volatile float %f32.add0, float addrspace(1)* undef
+ store volatile float %f32.add1, float addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile half, half addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f32.add = fadd half %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile half %f32.add, half addrspace(1)* undef
+ ret void
+ }
+
+ define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+ %f16.val0 = load volatile half, half addrspace(1)* undef
+ %f16.val1 = load volatile half, half addrspace(1)* undef
+ %f32.val = load volatile half, half addrspace(1)* undef
+ %f16.add0 = fadd half %f16.val0, 0xH3C00
+ %f32.add = fadd half %f32.val, 1.000000e+00
+ store volatile half %f16.add0, half addrspace(1)* undef
+ store volatile half %f32.add, half addrspace(1)* undef
+ ret void
+ }
+
+ attributes #0 = { nounwind }
+
+...
+---
+
+# f32 1.0 with a single use should be folded into the f16 use as a
+# 32-bit literal constant.
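+# (1065353216 is 0x3f800000, the IEEE-754 bit pattern of f32 1.0.)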
+
+# CHECK-LABEL: name: add_f32_1.0_one_f16_use
+# CHECK: %13:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit %exec
+
+name: add_f32_1.0_one_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = V_MOV_B32_e32 1065353216, implicit %exec
+ %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+# Materialized f32 inline immediate should not be folded into the f16
+# operands
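+# (Folding a 32-bit literal into more than one use would re-encode the
+# 4-byte constant at each use site, which is presumably why multi-use
+# literals are kept materialized in a register instead.)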
+
+# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec
+
+
+name: add_f32_1.0_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %13 = V_MOV_B32_e32 1065353216, implicit %exec
+ %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+ %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded into the single f32 use as an inline
+# immediate, and folded into the single f16 use as a literal constant
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name: add_f32_1.0_one_f32_use_one_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %14 = V_MOV_B32_e32 1065353216, implicit %exec
+ %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+ %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded for the single f32 use as an inline
+# constant, and not folded as a multi-use literal for the f16 cases
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
+# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %11, %14, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12, %14, implicit %exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name: add_f32_1.0_one_f32_use_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %14 = V_MOV_B32_e32 1065353216, implicit %exec
+ %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+ %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+ %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+# CHECK-LABEL: name: add_i32_1_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1, killed %11, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1, killed %12, implicit %exec
+
+
+name: add_i32_1_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %13 = V_MOV_B32_e32 1, implicit %exec
+ %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+ %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
+# CHECK: %14:vgpr_32 = V_MOV_B32_e32 -2, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 -2, %11, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 -2, %12, implicit %exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 -2, killed %13, implicit %exec
+
+name: add_i32_m2_one_f32_use_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %14 = V_MOV_B32_e32 -2, implicit %exec
+ %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+ %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+ %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# f16 1.0 (0x3c00) is not an inline immediate for the f32 instructions,
+# and it has multiple uses, so it should stay materialized in a register
+# rather than being folded as a literal.
+
+# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit %exec
+
+name: add_f16_1.0_multi_f32_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %13 = V_MOV_B32_e32 15360, implicit %exec
+ %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+ %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# The low 16 bits are an inline immediate, but the high bits are junk
+# FIXME: Should be able to fold this
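+# (80886784 is 0x04d23c00: the low half 0x3c00 is f16 1.0, the high half
+# 0x04d2 is the junk.)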
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit %exec
+
+name: add_f16_1.0_other_high_bits_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %13 = V_MOV_B32_e32 80886784, implicit %exec
+ %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec
+ %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+ BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ S_ENDPGM
+
+...
+---
+
+# FIXME: Should fold inline immediate into f16 and literal use into
+# f32 instruction.
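+# (305413120 is 0x12343c00: the low half 0x3c00 is the f16 1.0 inline
+# immediate, the high half 0x1234 is junk.)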
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit %exec
+name: add_f16_1.0_other_high_bits_use_f16_f32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64 }
+ - { id: 5, class: sreg_32 }
+ - { id: 6, class: sreg_64 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: sreg_32 }
+ - { id: 10, class: sreg_128 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ %4 = IMPLICIT_DEF
+ %5 = COPY %4.sub1
+ %6 = IMPLICIT_DEF
+ %7 = COPY %6.sub0
+ %8 = S_MOV_B32 61440
+ %9 = S_MOV_B32 -1
+ %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+ %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+ %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+ %13 = V_MOV_B32_e32 305413120, implicit %exec
+ %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+ %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+ BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+ S_ENDPGM
+
+...
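For reference, the 32-bit literals the fold-imm tests above check against
are just the IEEE-754 encodings of 1.0; a quick Python sketch (an
illustration, not part of the tests) reproduces them:

    import struct

    # f32 1.0 -> 0x3f800000 == 1065353216, the V_MOV_B32 / literal operand.
    f32_one = struct.unpack('<I', struct.pack('<f', 1.0))[0]
    # f16 1.0 -> 0x3c00 == 15360.
    f16_one = struct.unpack('<H', struct.pack('<e', 1.0))[0]
    assert f32_one == 0x3f800000 == 1065353216
    assert f16_one == 0x3c00 == 15360
    # The "junk high bits" patterns from the FIXME cases above:
    assert (0x04d2 << 16) | f16_one == 80886784
    assert (0x1234 << 16) | f16_one == 305413120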
diff --git a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
new file mode 100644
index 00000000000..b9b6ee6887b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
@@ -0,0 +1,40 @@
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
+--- |
+ define amdgpu_kernel void @test() #0 {
+ ret void
+ }
+
+ attributes #0 = { nounwind }
+
+...
+---
+
+# This used to crash / trigger an assertion, because re-scanning the use list
+# after constant-folding the definition of %3 led to the definition of %2
+# being processed twice.
+
+# CHECK-LABEL: name: test
+# CHECK: %2:vgpr_32 = V_LSHLREV_B32_e32 2, killed %0, implicit %exec
+# CHECK: %4:vgpr_32 = V_AND_B32_e32 8, killed %2, implicit %exec
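+# (%1 is the constant 2, so %3 constant-folds to 2 << 2 = 8, which then
+# folds into the V_AND as an inline immediate, while %2 keeps 2 as its
+# shift amount.)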
+
+name: test
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sreg_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_32 }
+ - { id: 4, class: vgpr_32 }
+ - { id: 5, class: sreg_128 }
+body: |
+ bb.0 (%ir-block.0):
+ %0 = IMPLICIT_DEF
+ %1 = S_MOV_B32 2
+ %2 = V_LSHLREV_B32_e64 %1, killed %0, implicit %exec
+ %3 = S_LSHL_B32 %1, killed %1, implicit-def dead %scc
+ %4 = V_AND_B32_e64 killed %2, killed %3, implicit %exec
+ %5 = IMPLICIT_DEF
+ BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
new file mode 100644
index 00000000000..d4ddfbe31b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -0,0 +1,122 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ ; Function Attrs: nounwind readnone
+ declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+ ; Function Attrs: nounwind
+ define amdgpu_kernel void @atomic_max_i32_noret(
+ i32 addrspace(1)* %out,
+ i32 addrspace(1)* addrspace(1)* %in,
+ i32 addrspace(1)* %x,
+ i32 %y) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i64 %idxprom
+ %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
+ %xor = xor i32 %tid, 1
+ %cmp = icmp ne i32 %xor, 0
+ %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp)
+ %2 = extractvalue { i1, i64 } %1, 0
+ %3 = extractvalue { i1, i64 } %1, 1
+ br i1 %2, label %atomic, label %exit
+
+ atomic: ; preds = %0
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100
+ %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst
+ br label %exit
+
+ exit: ; preds = %atomic, %0
+ call void @llvm.amdgcn.end.cf(i64 %3)
+ ret void
+ }
+
+ declare { i1, i64 } @llvm.amdgcn.if(i1)
+
+ declare void @llvm.amdgcn.end.cf(i64)
+
+ ; Function Attrs: nounwind
+ declare void @llvm.stackprotector(i8*, i8**) #3
+
+ attributes #0 = { nounwind readnone "target-cpu"="tahiti" }
+ attributes #1 = { nounwind "target-cpu"="tahiti" }
+ attributes #2 = { readnone }
+ attributes #3 = { nounwind }
+
+...
+---
+
+# CHECK-LABEL: name: atomic_max_i32_noret
+
+# CHECK-LABEL: bb.1.atomic:
+# CHECK: BUFFER_ATOMIC_SMAX_ADDR64
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
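+# (3952 is 0xf70, which under the pre-gfx10 S_WAITCNT encoding should
+# decode to vmcnt(0): the seq_cst atomic is followed by a wait for all
+# outstanding vector memory operations plus an L1 invalidate. See the
+# decoding sketch after this file.)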
+
+name: atomic_max_i32_noret
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+ - { reg: '%vgpr0' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0 (%ir-block.0):
+ successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000)
+ liveins: %vgpr0, %sgpr0_sgpr1
+
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %vgpr1 = V_ASHRREV_I32_e32 31, %vgpr0, implicit %exec
+ %vgpr1_vgpr2 = V_LSHL_B64 %vgpr0_vgpr1, 3, implicit %exec
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 0
+ S_WAITCNT 127
+ %vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed %vgpr1_vgpr2, %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 8 from %ir.tid.gep)
+ %vgpr0 = V_XOR_B32_e32 1, killed %vgpr0, implicit %exec
+ V_CMP_NE_U32_e32 0, killed %vgpr0, implicit-def %vcc, implicit %exec
+ %sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec
+ %sgpr2_sgpr3 = S_XOR_B64 %exec, killed %sgpr2_sgpr3, implicit-def dead %scc
+ SI_MASK_BRANCH %bb.2.exit, implicit %exec
+
+ bb.1.atomic:
+ successors: %bb.2.exit(0x80000000)
+ liveins: %sgpr4_sgpr5_sgpr6_sgpr7:0x0000000C, %sgpr0_sgpr1, %sgpr2_sgpr3, %vgpr1_vgpr2_vgpr3_vgpr4:0x00000003
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 15, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ dead %vgpr0 = V_MOV_B32_e32 -1, implicit %exec
+ dead %vgpr0 = V_MOV_B32_e32 61440, implicit %exec
+ %sgpr4_sgpr5 = S_MOV_B64 0
+ S_WAITCNT 127
+ %vgpr0 = V_MOV_B32_e32 killed %sgpr0, implicit %exec, implicit %exec
+ S_WAITCNT 3952
+ BUFFER_ATOMIC_SMAX_ADDR64 killed %vgpr0, killed %vgpr1_vgpr2, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit %exec :: (volatile load seq_cst 4 from %ir.gep)
+
+ bb.2.exit:
+ liveins: %sgpr2_sgpr3
+
+ %exec = S_OR_B64 %exec, killed %sgpr2_sgpr3, implicit-def %scc
+ S_ENDPGM
+
+...
+
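The S_WAITCNT immediates in these memory-legalizer tests are packed SIMM16
fields. Assuming the pre-gfx10 layout (vmcnt in bits 3:0, expcnt in bits
6:4, lgkmcnt in bits 11:8), they decode as follows; a Python sketch, not
part of the tests:

    def decode_waitcnt(simm16):
        # Pre-gfx10 S_WAITCNT layout; a field at its maximum means "no wait".
        vmcnt = simm16 & 0xf
        expcnt = (simm16 >> 4) & 0x7
        lgkmcnt = (simm16 >> 8) & 0xf
        return vmcnt, expcnt, lgkmcnt

    print(decode_waitcnt(3952))  # (0, 7, 15): vmcnt(0), vector memory done
    print(decode_waitcnt(127))   # (15, 7, 0): lgkmcnt(0), scalar/LDS done
    print(decode_waitcnt(3855))  # (15, 0, 15): expcnt(0), exports done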
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
new file mode 100644
index 00000000000..2f3095c777a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -0,0 +1,163 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+ ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+ source_filename = "memory-legalizer-multiple-mem-operands.ll"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+ entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+ %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+ store i32 1, i32* %scratchptr01
+ %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+ store i32 2, i32* %scratchptr12
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ if: ; preds = %entry
+ %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+ %if_value = load atomic i32, i32* %if_ptr syncscope("workgroup") seq_cst, align 4
+ br label %done, !structurizecfg.uniform !0
+
+ else: ; preds = %entry
+ %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+ %else_value = load atomic i32, i32* %else_ptr syncscope("agent") unordered, align 4
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %else, %if
+ %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.break(i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+ ; Function Attrs: convergent nounwind
+ declare i1 @llvm.amdgcn.loop(i64) #1
+
+ ; Function Attrs: convergent nounwind
+ declare void @llvm.amdgcn.end.cf(i64) #1
+
+ attributes #0 = { "target-cpu"="gfx803" }
+ attributes #1 = { convergent nounwind }
+ attributes #2 = { convergent nounwind readnone }
+
+ !0 = !{}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_LOAD_DWORD_OFFEN
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
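+# (The merged load carries both a workgroup seq_cst and an agent unordered
+# memory operand; the legalizer appears to honor the stronger ordering,
+# hence the vmcnt(0) waits around the load and the L1 invalidate.)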
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+ - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 65540
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+ isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+ - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+ stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+ di-expression: '', di-location: '' }
+ - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+ alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+ liveins: %sgpr0_sgpr1, %sgpr3
+
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+ S_WAITCNT 127
+ S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+ %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+ BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+ S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+ bb.2.else:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.1.if:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+ bb.3.done:
+ liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+ S_WAITCNT 127
+ %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+ %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (load syncscope("agent") unordered 4 from %ir.else_ptr), (load syncscope("workgroup") seq_cst 4 from %ir.if_ptr)
+ %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+ S_WAITCNT 3952
+ FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
new file mode 100644
index 00000000000..263bbeb0596
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+ ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+ source_filename = "memory-legalizer-multiple-mem-operands.ll"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+ entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+ %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+ store i32 1, i32* %scratchptr01
+ %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+ store i32 2, i32* %scratchptr12
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ if: ; preds = %entry
+ %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+ %if_value = load i32, i32* %if_ptr, align 4, !nontemporal !1
+ br label %done, !structurizecfg.uniform !0
+
+ else: ; preds = %entry
+ %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+ %else_value = load i32, i32* %else_ptr, align 4, !nontemporal !1
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %else, %if
+ %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.break(i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+ ; Function Attrs: convergent nounwind
+ declare i1 @llvm.amdgcn.loop(i64) #1
+
+ ; Function Attrs: convergent nounwind
+ declare void @llvm.amdgcn.end.cf(i64) #1
+
+ attributes #0 = { "target-cpu"="gfx803" }
+ attributes #1 = { convergent nounwind }
+ attributes #2 = { convergent nounwind readnone }
+
+ !0 = !{}
+ !1 = !{i32 1}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 1, 1, 0
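+# (Both memory operands are non-temporal here, so the legalizer sets the
+# glc and slc bits, the two 1s following the offset, on the merged load.)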
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+ - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 65540
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+ isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+ - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+ stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+ di-expression: '', di-location: '' }
+ - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+ alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+ liveins: %sgpr0_sgpr1, %sgpr3
+
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+ S_WAITCNT 127
+ S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+ %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+ BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+ S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+ bb.2.else:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.1.if:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+ bb.3.done:
+ liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+ S_WAITCNT 127
+ %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+ %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+ %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+ S_WAITCNT 3952
+ FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
new file mode 100644
index 00000000000..7e0c9e44e37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+ ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+ source_filename = "memory-legalizer-multiple-mem-operands.ll"
+ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+ define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+ entry:
+ %scratch0 = alloca [8192 x i32]
+ %scratch1 = alloca [8192 x i32]
+ %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+ store i32 1, i32* %scratchptr01
+ %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+ store i32 2, i32* %scratchptr12
+ %cmp = icmp eq i32 %cond, 0
+ br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ if: ; preds = %entry
+ %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+ %if_value = load i32, i32* %if_ptr, align 4, !nontemporal !1
+ br label %done, !structurizecfg.uniform !0
+
+ else: ; preds = %entry
+ %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+ %else_value = load i32, i32* %else_ptr, align 4
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %else, %if
+ %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+ ; Function Attrs: convergent nounwind
+ declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.break(i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+ ; Function Attrs: convergent nounwind readnone
+ declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+ ; Function Attrs: convergent nounwind
+ declare i1 @llvm.amdgcn.loop(i64) #1
+
+ ; Function Attrs: convergent nounwind
+ declare void @llvm.amdgcn.end.cf(i64) #1
+
+ attributes #0 = { "target-cpu"="gfx803" }
+ attributes #1 = { convergent nounwind }
+ attributes #2 = { convergent nounwind readnone }
+
+ !0 = !{}
+ !1 = !{i32 1}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0
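+# (Only the if_ptr operand is non-temporal here; the else_ptr load is
+# ordinary, so the merged load keeps glc and slc clear.)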
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+ - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 65540
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+ isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+ - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+ stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+ di-expression: '', di-location: '' }
+ - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+ alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+ liveins: %sgpr0_sgpr1, %sgpr3
+
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+ S_WAITCNT 127
+ S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+ %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+ BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+ S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+ bb.2.else:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.1.if:
+ successors: %bb.3.done(0x80000000)
+ liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+ %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+ S_WAITCNT 3855
+ %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+ bb.3.done:
+ liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+ S_WAITCNT 127
+ %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+ %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+ %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+ %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+ %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+ S_WAITCNT 3952
+ FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...