diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-07-11 21:19:33 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-07-11 21:19:33 +0000 |
commit | e67cc380a800d91297bae9e82ea3357ff39e379d (patch) | |
tree | f5638751a2f622faef02c4a7533cc95fd82e8fef /llvm/test/CodeGen | |
parent | 6bd26db06aae4fd27ea38f1aaac382005a079d29 (diff) | |
download | bcm5719-llvm-e67cc380a800d91297bae9e82ea3357ff39e379d.tar.gz bcm5719-llvm-e67cc380a800d91297bae9e82ea3357ff39e379d.zip |
[AMDGPU] gfx908 mfma support
Differential Revision: https://reviews.llvm.org/D64584
llvm-svn: 365824
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 132 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/agpr-register-count.ll | 15 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 1361 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/load-local-i32.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/mai-inline.ll | 190 |
8 files changed, 1724 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir new file mode 100644 index 00000000000..8db8e1bb017 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -0,0 +1,132 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: a_to_v +# GCN: $vgpr0 = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec +--- +name: a_to_v +body: | + bb.0: + $vgpr0 = COPY killed $agpr0, implicit $exec +... + +# GCN-LABEL: name: a4_to_v4 +# GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 +# GCN: $vgpr1 = V_ACCVGPR_READ_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 +# GCN: $vgpr2 = V_ACCVGPR_READ_B32 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 +# GCN: $vgpr3 = V_ACCVGPR_READ_B32 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 +--- +name: a4_to_v4 +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec +... + +# GCN-LABEL: name: a16_to_v16 +# GCN: $vgpr0 = V_ACCVGPR_READ_B32 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 +# GCN: $vgpr15 = V_ACCVGPR_READ_B32 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 +--- +name: a16_to_v16 +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec +... 
+ +# GCN-LABEL: name: v_to_a +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr0, implicit $exec +--- +name: v_to_a +body: | + bb.0: + $agpr0 = COPY killed $vgpr0, implicit $exec +... + +# GCN-LABEL: name: v4_to_a4 +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 +# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 +# GCN: $agpr2 = V_ACCVGPR_WRITE_B32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 +# GCN: $agpr3 = V_ACCVGPR_WRITE_B32 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec +--- +name: v4_to_a4 +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec +... + +# GCN-LABEL: name: v16_to_a16 +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +# GCN: $agpr15 = V_ACCVGPR_WRITE_B32 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +--- +name: v16_to_a16 +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec +... + +# GCN-LABEL: name: s_to_a +# GCN: $vgpr[[TMP:[0-9]+]] = V_MOV_B32_e32 killed $sgpr0, implicit $exec +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP]], implicit $exec +--- +name: s_to_a +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + $agpr0 = COPY killed $sgpr0, implicit $exec +... 
+ +# GCN-LABEL: name: s2_to_a2 +# GCN: $vgpr[[TMP1:[0-9]+]] = V_MOV_B32_e32 killed $sgpr0, implicit $exec +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP1]], implicit $exec +# GCN: $vgpr[[TMP2:[0-9]+]] = V_MOV_B32_e32 killed $sgpr1, implicit $exec +# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP2]], implicit $exec +--- +name: s2_to_a2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec +... + +# GCN-LABEL: name: a_to_a +# GCN: $vgpr[[TMP:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP]], implicit $exec +--- +name: a_to_a +tracksRegLiveness: true +body: | + bb.0: + $agpr1 = IMPLICIT_DEF + $agpr0 = COPY killed $agpr1, implicit $exec +... + +# GCN-LABEL: name: a2_to_a2 +# GCN: $vgpr[[TMP1:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec +# GCN: $agpr2 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP1]], implicit $exec +# GCN: $vgpr[[TMP2:[0-9]+]] = V_ACCVGPR_READ_B32 killed $agpr0, implicit $exec +# GCN: $agpr1 = V_ACCVGPR_WRITE_B32 killed $vgpr[[TMP2]], implicit $exec +--- +name: a2_to_a2 +tracksRegLiveness: true +body: | + bb.0: + $agpr0_agpr1 = IMPLICIT_DEF + $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec +... + +# GCN-LABEL: name: a_to_a_spill +# Using the last VGPR (vgpr255) will raise an error about the absence of an emergency spill slot. 
+ +# GCN: $vgpr255 = V_ACCVGPR_READ_B32 killed $agpr1, implicit $exec +# GCN: $agpr0 = V_ACCVGPR_WRITE_B32 killed $vgpr255, implicit $exec + +--- +name: a_to_a_spill +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 + + $agpr1 = IMPLICIT_DEF + $agpr0 = COPY killed $agpr1, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll new file mode 100644 index 00000000000..ab4fcc54f65 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_32_agprs: +; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0 +; GCN-NOT: v28 +; GCN: NumVgprs: 32 +; GCN: VGPRBlocks: 7 +define amdgpu_kernel void @test_32_agprs(<32 x i32> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index 0c5b8fbda22..abcf3342fcf 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -42,4 +42,21 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { ret void } +; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy a1 to s9 +define amdgpu_kernel void 
@illegal_agpr_to_sgpr_copy_i32() #1 { + %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() + call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) + ret void +} + +; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy +; GCN: ; illegal copy a[0:1] to s[10:11] +define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { + %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() + call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) + ret void +} + attributes #0 = { nounwind } +attributes #1 = { nounwind "target-cpu"="gfx908" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll new file mode 100644 index 00000000000..0ce08777c14 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -0,0 +1,1361 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32) +declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x i32>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) +declare <32 x i32> 
@llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) +declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x i32>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; 
GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: +; GCN-DAG: 
v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 
v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x i32> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = 
getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: +; GCN: s_load_dwordx16 +; GCN: s_waitcnt lgkmcnt(0) +; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; 
GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c + %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1 + %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; 
GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 
v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; 
GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; 
GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; 
GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret 
void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: +; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> %mai.1, i32 0, i32 0, i32 0) + store <32 x i32> %mai.2, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: +; GCN: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.2, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: +; GCN: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, 
float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.2, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x 
float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 +; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x i32> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 
a{{[0-9]+}}, 2.0 +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: 
v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; 
GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x i32> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x i32> <i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0, i32 0) + store <32 x i32> 
%mai.1, <32 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat: +; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg) { +bb: + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: v_accvgpr_read_b32 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +; GCN-DAG: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x i32> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x i32>, <32 x i32> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %gep + %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x 
i32> %in.1, i32 1, i32 2, i32 3) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %gep + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index d4eab1babc7..fd5c94868fe 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i32: ; GCN: s_load_dword s{{[0-9]+}} @@ -402,6 +403,8 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspa ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 +; GCN-NOT: accvgpr + ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index fcbf8a1fc4a..0a60413494a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -2,8 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false 
-march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s - +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}} @@ -560,6 +560,8 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-NOHSA-DAG: buffer_store_dwordx4 +; GCN-NOT: accvgpr + ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll index f9b5cce823c..10887885ef6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=gfx900 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,GFX9,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,GFX9,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; Testing for ds_read/write_128 @@ -268,6 +269,7 @@ define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace( ; FUNC-LABEL: {{^}}local_load_v32i32: ; SICIVI: s_mov_b32 m0, -1 ; GFX9-NOT: m0 +; GFX9-NOT: accvgpr define amdgpu_kernel void @local_load_v32i32(<32 x i32> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in diff --git a/llvm/test/CodeGen/AMDGPU/mai-inline.ll b/llvm/test/CodeGen/AMDGPU/mai-inline.ll new file mode 100644 index 00000000000..a9d8c5cb2e0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mai-inline.ll @@ -0,0 +1,190 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s + +; GCN-LABEL: {{^}}accvgpr_write_read: +; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1 +; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]] +; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off +define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) { +bb: + %in.1 = load float, float addrspace(1)* %arg + %init = tail call float asm "v_accvgpr_write $0, 1", "=a"() + %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init) + store float %read, float addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_avva +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void 
@v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_aaaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], a{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_4x4x4f16_aaaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_4x4x4f16 a[{{[0-9:]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>, <4 x half> <half 0xH03FF, half 0xH03FF, half 0xH03FF, half 0xH03FF>, <4 x float> %in.1) + store <4 x float> %mai.1, <4 x 
float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_16x16x1f32_avaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_16x16x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x float> %in.1) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}v_mfma_f32_32x32x1f32_avaa +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 
+; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_accvgpr_write_b32 +; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +; GFX908: v_accvgpr_read_b32 +define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg + %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1) + store <32 x i32> %mai.1, <32 x i32> addrspace(1)* 
%arg + ret void +} |