| author | Matthias Braun <matze@braunis.de> | 2018-01-19 06:08:15 +0000 |
|---|---|---|
| committer | Matthias Braun <matze@braunis.de> | 2018-01-19 06:08:15 +0000 |
| commit | 8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa (patch) | |
| tree | 6d24f43c40f1f809368b4043fad52c9ab20fd557 /llvm/test/CodeGen/AMDGPU | |
| parent | f4cd9083acecb478f1d19532bb23624128b5af40 (diff) | |
| download | bcm5719-llvm-8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa.tar.gz bcm5719-llvm-8bb5228db925bd4d3ef4d41c88b2f7c51d8221fa.zip | |
Move tests to the correct place
test/CodeGen/MIR is for testing the MIR parser/printer. Tests for passes
and targets belong in test/CodeGen/TARGETNAME.
llvm-svn: 322925
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')

- llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir (709 insertions)
- llvm/test/CodeGen/AMDGPU/fold-multiple.mir (40 insertions)
- llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir (122 insertions)
- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir (163 insertions)
- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir (161 insertions)
- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir (161 insertions)

6 files changed, 1356 insertions, 0 deletions
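A note for reading the immediates in the CHECK lines below: values such as 1065353216 and 15360 are raw IEEE-754 bit patterns. A small standalone sketch (plain Python, not part of the test suite) confirming the correspondences:

```python
import struct

def f32_bits(x):
    # Raw bit pattern of a 32-bit float, as an unsigned integer.
    return struct.unpack('<I', struct.pack('<f', x))[0]

def f16_bits(x):
    # Raw bit pattern of a 16-bit float (IEEE 754 binary16).
    return struct.unpack('<H', struct.pack('<e', x))[0]

assert f32_bits(1.0) == 0x3F800000 == 1065353216  # the f32 1.0 literal in the checks
assert f16_bits(1.0) == 0x3C00 == 15360           # 0xH3C00 in the IR, 15360 in MIR
assert f32_bits(-2.0) == 0xC0000000               # low 32 bits of 0xffffffffc0000000
```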
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
new file mode 100644
index 00000000000..cae8ed80d16
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -0,0 +1,709 @@
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
+--- |
+  define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f16.add1 = fadd half %f16.val1, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH0001
+    %f16.add1 = fadd half %f16.val1, 0xH0001
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xHFFFE
+    %f16.add1 = fadd half %f16.val1, 0xHFFFE
+    %f32.add = fadd float %f32.val, 0xffffffffc0000000
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 {
+    %f32.val0 = load volatile float, float addrspace(1)* undef
+    %f32.val1 = load volatile float, float addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f32.add0 = fadd float %f32.val0, 1.0
+    %f32.add1 = fadd float %f32.val1, 1.0
+    store volatile float %f32.add0, float addrspace(1)* undef
+    store volatile float %f32.add1, float addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd half %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f32.add, half addrspace(1)* undef
+    ret void
+  }
+
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd half %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f32.add, half addrspace(1)* undef
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+...
+---
+
+# f32 1.0 with a single use should be folded as the low 32-bits of a
+# literal constant.
+
+# CHECK-LABEL: name: add_f32_1.0_one_f16_use
+# CHECK: %13:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit %exec
+
+name: add_f32_1.0_one_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = V_MOV_B32_e32 1065353216, implicit %exec
+    %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+# Materialized f32 inline immediate should not be folded into the f16
+# operands
+
+# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec
+
+
+name: add_f32_1.0_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 1065353216, implicit %exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded into the single f32 use as an inline
+# immediate, and folded into the single f16 use as a literal constant
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name: add_f32_1.0_one_f32_use_one_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 1065353216, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded for the single f32 use as an inline
+# constant, and not folded as a multi-use literal for the f16 cases
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
+# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %11, %14, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12, %14, implicit %exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name: add_f32_1.0_one_f32_use_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 1065353216, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
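The split visible across the four tests above follows from the two kinds of VALU constants: an inline immediate (0.0, ±1.0, ±2.0, small integers, and so on) encodes for free in any number of instructions, while anything else is a literal that costs an extra dword and that an instruction can carry at most one of, so folding a literal only pays off for a single use. A rough sketch of that profitability rule (illustrative Python; the real logic in SIFoldOperands is considerably more involved, with per-opcode operand legality checks):

```python
# Illustrative subset only: the real inline-immediate sets are per-type
# and per-subtarget.
F32_INLINE_IMMS = {0x00000000, 0x3F800000, 0xBF800000, 0x40000000}
SMALL_INT_IMMS = set(range(-16, 65))  # integers -16..64 also encode inline

def uses_to_fold(value, uses, inline_imms):
    if value in inline_imms:
        return uses       # inline immediate: free, fold into every use
    return uses[:1]       # literal: an extra dword, only fold a single use

# f32 1.0 folds into its f32 use and into a lone f16 use (where it is a
# literal), but stays materialized when two f16 uses would each need it.
```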
+---
+# CHECK-LABEL: name: add_i32_1_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1, killed %11, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1, killed %12, implicit %exec
+
+
+name: add_i32_1_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 1, implicit %exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
+# CHECK: %14:vgpr_32 = V_MOV_B32_e32 -2, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 -2, %11, implicit %exec
+# CHECK: %16:vgpr_32 = V_ADD_F16_e32 -2, %12, implicit %exec
+# CHECK: %17:vgpr_32 = V_ADD_F32_e32 -2, killed %13, implicit %exec
+
+name: add_i32_m2_one_f32_use_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 -2, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f16 1.0 (0x3C00) is not an inline immediate for the f32 uses, so it
+# should stay materialized rather than be folded as a multi-use literal
+
+# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit %exec
+
+name: add_f16_1.0_multi_f32_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 15360, implicit %exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# The low 16-bits are an inline immediate, but the high bits are junk
+# FIXME: Should be able to fold this
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit %exec
+
+name: add_f16_1.0_other_high_bits_multi_f16_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 80886784, implicit %exec
+    %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# FIXME: Should fold inline immediate into f16 and literal use into
+# f32 instruction.
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
+# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit %exec
+# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit %exec
+# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit %exec
+name: add_f16_1.0_other_high_bits_use_f16_f32
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 305413120, implicit %exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
new file mode 100644
index 00000000000..b9b6ee6887b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir
@@ -0,0 +1,40 @@
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
+--- |
+  define amdgpu_kernel void @test() #0 {
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+...
+---
+
+# This used to crash / trigger an assertion, because re-scanning the use list
+# after constant-folding the definition of %3 led to the definition of %2
+# being processed twice.
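The comment above describes a classic mutate-while-iterating hazard: folding one definition triggers a re-scan of a use list, and an already-visited entry gets processed again. A minimal sketch of the failure pattern (illustrative Python, not the pass's actual data structures):

```python
uses = ['%2', '%3']   # uses scanned in order by the folding loop
visited = []

i = 0
while i < len(uses):
    use = uses[i]
    visited.append(use)
    if use == '%3':    # constant-folding %3's definition removes it...
        uses.remove('%3')
        i = 0          # ...and the naive code restarts the scan
        continue
    i += 1

assert visited.count('%2') == 2  # %2 processed twice -> the old assertion
```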
+
+# CHECK-LABEL: name: test
+# CHECK: %2:vgpr_32 = V_LSHLREV_B32_e32 2, killed %0, implicit %exec
+# CHECK: %4:vgpr_32 = V_AND_B32_e32 8, killed %2, implicit %exec
+
+name: test
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_128 }
+body: |
+  bb.0 (%ir-block.0):
+    %0 = IMPLICIT_DEF
+    %1 = S_MOV_B32 2
+    %2 = V_LSHLREV_B32_e64 %1, killed %0, implicit %exec
+    %3 = S_LSHL_B32 %1, killed %1, implicit-def dead %scc
+    %4 = V_AND_B32_e64 killed %2, killed %3, implicit %exec
+    %5 = IMPLICIT_DEF
+    BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
new file mode 100644
index 00000000000..d4ddfbe31b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -0,0 +1,122 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "<stdin>"
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  ; Function Attrs: nounwind readnone
+  declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @atomic_max_i32_noret(
+      i32 addrspace(1)* %out,
+      i32 addrspace(1)* addrspace(1)* %in,
+      i32 addrspace(1)* %x,
+      i32 %y) #1 {
+    %tid = call i32 @llvm.amdgcn.workitem.id.x()
+    %idxprom = sext i32 %tid to i64
+    %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i64 %idxprom
+    %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
+    %xor = xor i32 %tid, 1
+    %cmp = icmp ne i32 %xor, 0
+    %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp)
+    %2 = extractvalue { i1, i64 } %1, 0
+    %3 = extractvalue { i1, i64 } %1, 1
+    br i1 %2, label %atomic, label %exit
+
+  atomic:                                           ; preds = %0
+    %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100
+    %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst
+    br label %exit
+
+  exit:                                             ; preds = %atomic, %0
+    call void @llvm.amdgcn.end.cf(i64 %3)
+    ret void
+  }
+
+  declare { i1, i64 } @llvm.amdgcn.if(i1)
+
+  declare void @llvm.amdgcn.end.cf(i64)
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { nounwind readnone "target-cpu"="tahiti" }
+  attributes #1 = { nounwind "target-cpu"="tahiti" }
+  attributes #2 = { readnone }
+  attributes #3 = { nounwind }
+
+...
+---
+
+# CHECK-LABEL: name: atomic_max_i32_noret
+
+# CHECK-LABEL: bb.1.atomic:
+# CHECK: BUFFER_ATOMIC_SMAX_ADDR64
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
+
+name: atomic_max_i32_noret
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+  - { reg: '%vgpr0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 0
+  adjustsStack: false
+  hasCalls: false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+body: |
+  bb.0 (%ir-block.0):
+    successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %vgpr1 = V_ASHRREV_I32_e32 31, %vgpr0, implicit %exec
+    %vgpr1_vgpr2 = V_LSHL_B64 %vgpr0_vgpr1, 3, implicit %exec
+    %sgpr7 = S_MOV_B32 61440
+    %sgpr6 = S_MOV_B32 0
+    S_WAITCNT 127
+    %vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed %vgpr1_vgpr2, %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 8 from %ir.tid.gep)
+    %vgpr0 = V_XOR_B32_e32 1, killed %vgpr0, implicit %exec
+    V_CMP_NE_U32_e32 0, killed %vgpr0, implicit-def %vcc, implicit %exec
+    %sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec
+    %sgpr2_sgpr3 = S_XOR_B64 %exec, killed %sgpr2_sgpr3, implicit-def dead %scc
+    SI_MASK_BRANCH %bb.2.exit, implicit %exec
+
+  bb.1.atomic:
+    successors: %bb.2.exit(0x80000000)
+    liveins: %sgpr4_sgpr5_sgpr6_sgpr7:0x0000000C, %sgpr0_sgpr1, %sgpr2_sgpr3, %vgpr1_vgpr2_vgpr3_vgpr4:0x00000003
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 15, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    dead %vgpr0 = V_MOV_B32_e32 -1, implicit %exec
+    dead %vgpr0 = V_MOV_B32_e32 61440, implicit %exec
+    %sgpr4_sgpr5 = S_MOV_B64 0
+    S_WAITCNT 127
+    %vgpr0 = V_MOV_B32_e32 killed %sgpr0, implicit %exec, implicit %exec
+    S_WAITCNT 3952
+    BUFFER_ATOMIC_SMAX_ADDR64 killed %vgpr0, killed %vgpr1_vgpr2, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit %exec :: (volatile load seq_cst 4 from %ir.gep)
+
+  bb.2.exit:
+    liveins: %sgpr2_sgpr3
+
+    %exec = S_OR_B64 %exec, killed %sgpr2_sgpr3, implicit-def %scc
+    S_ENDPGM
+
+...
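The S_WAITCNT immediates in these checks pack three counters into one word. A decoding sketch, assuming the pre-gfx9 field layout (vmcnt in bits 3:0, expcnt in bits 6:4, lgkmcnt in bits 11:8); a field left at its maximum value means "do not wait on this counter":

```python
def decode_waitcnt(simm16):
    # Pre-gfx9 S_WAITCNT encoding (assumed layout, see above).
    vmcnt   = simm16 & 0xF
    expcnt  = (simm16 >> 4) & 0x7
    lgkmcnt = (simm16 >> 8) & 0xF
    return vmcnt, expcnt, lgkmcnt

print(decode_waitcnt(3952))  # (0, 7, 15)  -> vmcnt(0): drain vector memory ops
print(decode_waitcnt(127))   # (15, 7, 0)  -> lgkmcnt(0): drain scalar/LDS ops
print(decode_waitcnt(3855))  # (15, 0, 15) -> expcnt(0): drain exports
```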
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
new file mode 100644
index 00000000000..2f3095c777a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -0,0 +1,163 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+  source_filename = "memory-legalizer-multiple-mem-operands.ll"
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+  entry:
+    %scratch0 = alloca [8192 x i32]
+    %scratch1 = alloca [8192 x i32]
+    %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+    store i32 1, i32* %scratchptr01
+    %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+    store i32 2, i32* %scratchptr12
+    %cmp = icmp eq i32 %cond, 0
+    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  if:                                               ; preds = %entry
+    %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+    %if_value = load atomic i32, i32* %if_ptr syncscope("workgroup") seq_cst, align 4
+    br label %done, !structurizecfg.uniform !0
+
+  else:                                             ; preds = %entry
+    %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+    %else_value = load atomic i32, i32* %else_ptr syncscope("agent") unordered, align 4
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %else, %if
+    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.break(i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+  ; Function Attrs: convergent nounwind
+  declare i1 @llvm.amdgcn.loop(i64) #1
+
+  ; Function Attrs: convergent nounwind
+  declare void @llvm.amdgcn.end.cf(i64) #1
+
+  attributes #0 = { "target-cpu"="gfx803" }
+  attributes #1 = { convergent nounwind }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_LOAD_DWORD_OFFEN
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+  - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 65540
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+      isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+  - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+      stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+      di-expression: '', di-location: '' }
+  - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+      alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+  bb.0.entry:
+    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+    liveins: %sgpr0_sgpr1, %sgpr3
+
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+    S_WAITCNT 127
+    S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+    %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+    BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+    S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+  bb.2.else:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.if:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+  bb.3.done:
+    liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+    S_WAITCNT 127
+    %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+    %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (load syncscope("agent") unordered 4 from %ir.else_ptr), (load syncscope("workgroup") seq_cst 4 from %ir.if_ptr)
+    %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+    %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+    S_WAITCNT 3952
+    FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
new file mode 100644
index 00000000000..263bbeb0596
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+  source_filename = "memory-legalizer-multiple-mem-operands.ll"
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+  entry:
+    %scratch0 = alloca [8192 x i32]
+    %scratch1 = alloca [8192 x i32]
+    %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+    store i32 1, i32* %scratchptr01
+    %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+    store i32 2, i32* %scratchptr12
+    %cmp = icmp eq i32 %cond, 0
+    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  if:                                               ; preds = %entry
+    %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+    %if_value = load i32, i32* %if_ptr, align 4, !nontemporal !1
+    br label %done, !structurizecfg.uniform !0
+
+  else:                                             ; preds = %entry
+    %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+    %else_value = load i32, i32* %else_ptr, align 4, !nontemporal !1
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %else, %if
+    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.break(i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+  ; Function Attrs: convergent nounwind
+  declare i1 @llvm.amdgcn.loop(i64) #1
+
+  ; Function Attrs: convergent nounwind
+  declare void @llvm.amdgcn.end.cf(i64) #1
+
+  attributes #0 = { "target-cpu"="gfx803" }
+  attributes #1 = { convergent nounwind }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
+  !1 = !{i32 1}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 1, 1, 0
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+  - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 65540
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+      isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+  - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+      stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+      di-expression: '', di-location: '' }
+  - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+      alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+  bb.0.entry:
+    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+    liveins: %sgpr0_sgpr1, %sgpr3
+
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+    S_WAITCNT 127
+    S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+    %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+    BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+    S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+  bb.2.else:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.if:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+  bb.3.done:
+    liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+    S_WAITCNT 127
+    %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+    %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+    %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+    S_WAITCNT 3952
+    FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
new file mode 100644
index 00000000000..7e0c9e44e37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = 'memory-legalizer-multiple-mem-operands.ll'
+  source_filename = "memory-legalizer-multiple-mem-operands.ll"
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+  entry:
+    %scratch0 = alloca [8192 x i32]
+    %scratch1 = alloca [8192 x i32]
+    %scratchptr01 = bitcast [8192 x i32]* %scratch0 to i32*
+    store i32 1, i32* %scratchptr01
+    %scratchptr12 = bitcast [8192 x i32]* %scratch1 to i32*
+    store i32 2, i32* %scratchptr12
+    %cmp = icmp eq i32 %cond, 0
+    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+  if:                                               ; preds = %entry
+    %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+    %if_value = load i32, i32* %if_ptr, align 4, !nontemporal !1
+    br label %done, !structurizecfg.uniform !0
+
+  else:                                             ; preds = %entry
+    %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+    %else_value = load i32, i32* %else_ptr, align 4
+    br label %done, !structurizecfg.uniform !0
+
+  done:                                             ; preds = %else, %if
+    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
+    store i32 %value, i32 addrspace(1)* %out
+    ret void
+  }
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+
+  ; Function Attrs: convergent nounwind
+  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.break(i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.if.break(i1, i64) #2
+
+  ; Function Attrs: convergent nounwind readnone
+  declare i64 @llvm.amdgcn.else.break(i64, i64) #2
+
+  ; Function Attrs: convergent nounwind
+  declare i1 @llvm.amdgcn.loop(i64) #1
+
+  ; Function Attrs: convergent nounwind
+  declare void @llvm.amdgcn.end.cf(i64) #1
+
+  attributes #0 = { "target-cpu"="gfx803" }
+  attributes #1 = { convergent nounwind }
+  attributes #2 = { convergent nounwind readnone }
+
+  !0 = !{}
+  !1 = !{i32 1}
+
+...
+---
+
+# CHECK-LABEL: name: multiple_mem_operands
+
+# CHECK-LABEL: bb.3.done:
+# CHECK: BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0
+
+name: multiple_mem_operands
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '' }
+  - { reg: '%sgpr3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 65540
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: 0,
+      isImmutable: false, isAliased: false, callee-saved-register: '' }
+stack:
+  - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4,
+      stack-id: 0, callee-saved-register: '', local-offset: 0, di-variable: '',
+      di-expression: '', di-location: '' }
+  - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768,
+      alignment: 4, stack-id: 0, callee-saved-register: '', local-offset: 32768,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+  bb.0.entry:
+    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
+    liveins: %sgpr0_sgpr1, %sgpr3
+
+    %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    %sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr10 = S_MOV_B32 4294967295, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %sgpr11 = S_MOV_B32 15204352, implicit-def %sgpr8_sgpr9_sgpr10_sgpr11
+    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %vgpr0, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr01)
+    S_WAITCNT 127
+    S_CMP_LG_U32 killed %sgpr2, 0, implicit-def %scc
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 2, implicit %exec
+    %vgpr1 = V_MOV_B32_e32 32772, implicit %exec
+    BUFFER_STORE_DWORD_OFFEN killed %vgpr0, killed %vgpr1, %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.scratchptr12)
+    S_CBRANCH_SCC0 %bb.1.if, implicit killed %scc
+
+  bb.2.else:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 32772, implicit %exec
+    S_BRANCH %bb.3.done
+
+  bb.1.if:
+    successors: %bb.3.done(0x80000000)
+    liveins: %sgpr0_sgpr1, %sgpr4_sgpr5, %sgpr3, %sgpr8_sgpr9_sgpr10_sgpr11
+
+    %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    S_WAITCNT 3855
+    %vgpr0 = V_MOV_B32_e32 4, implicit %exec
+
+  bb.3.done:
+    liveins: %sgpr3, %sgpr4_sgpr5, %sgpr8_sgpr9_sgpr10_sgpr11, %vgpr0, %sgpr0
+
+    S_WAITCNT 127
+    %sgpr0 = S_LSHL_B32 killed %sgpr0, 2, implicit-def dead %scc
+    %vgpr0 = V_ADD_I32_e32 killed %sgpr0, killed %vgpr0, implicit-def dead %vcc, implicit %exec
+    %vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed %vgpr0, killed %sgpr8_sgpr9_sgpr10_sgpr11, %sgpr3, 0, 0, 0, 0, implicit %exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    %vgpr1 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr1_vgpr2, implicit %sgpr4_sgpr5
+    %vgpr2 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit %sgpr4_sgpr5, implicit %exec
+    S_WAITCNT 3952
+    FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
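Taken together, the two nontemporal tests pin down a conjunction rule: the merged load in bb.3.done carries one memory operand per original load, and the legalizer sets what appear to be the cache-bypass glc/slc operands (the `1, 1` in nontemporal-1's CHECK versus `0, 0, 0, 0` here) only when every memory operand is nontemporal. A hedged sketch of that rule (illustrative Python; the actual check lives in SIMemoryLegalizer):

```python
def treat_as_nontemporal(mem_operands):
    # A merged instruction is handled as nontemporal only if *all* of its
    # memory operands are nontemporal (nontemporal-1 vs nontemporal-2).
    return bool(mem_operands) and all(op.get('nontemporal', False)
                                      for op in mem_operands)

assert treat_as_nontemporal([{'nontemporal': True}, {'nontemporal': True}])
assert not treat_as_nontemporal([{'nontemporal': False}, {'nontemporal': True}])
```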

