diff options
| author | Farhana Aleen <farhana.aleen@gmail.com> | 2018-07-16 18:19:59 +0000 | 
|---|---|---|
| committer | Farhana Aleen <farhana.aleen@gmail.com> | 2018-07-16 18:19:59 +0000 | 
| commit | c370d7b33d0aba8b80f3a0b633f4f8d92c448833 (patch) | |
| tree | b4e3f2030a2771b0aba9849c07db42611c322fb4 | |
| parent | 7f01d209939c3a08c7d15ea145d7056dc138c23a (diff) | |
| download | bcm5719-llvm-c370d7b33d0aba8b80f3a0b633f4f8d92c448833.tar.gz bcm5719-llvm-c370d7b33d0aba8b80f3a0b633f4f8d92c448833.zip  | |
[AMDGPU] Support a fdot2 pattern.
Summary: Optimize fma((float)S0.x, (float)S1.x, fma((float)S0.y, (float)S1.y, z))
                   -> fdot2((v2f16)S0, (v2f16)S1, (float)z)
Author: FarhanaAleen
Reviewed By: rampitec, b-sumner
Subscribers: AMDGPU
Differential Revision: https://reviews.llvm.org/D49146
llvm-svn: 337198
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 79 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/fdot2.ll | 232 | 
7 files changed, 320 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index acdedab7e13..485927c8e44 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3993,6 +3993,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(FMED3)    NODE_NAME_CASE(SMED3)    NODE_NAME_CASE(UMED3) +  NODE_NAME_CASE(FDOT2)    NODE_NAME_CASE(URECIP)    NODE_NAME_CASE(DIV_SCALE)    NODE_NAME_CASE(DIV_FMAS) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 1e027dd6712..30967d30fdf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -363,6 +363,7 @@ enum NodeType : unsigned {    FMED3,    SMED3,    UMED3, +  FDOT2,    URECIP,    DIV_SCALE,    DIV_FMAS, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index f7ce519b291..96b7568eec1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -341,6 +341,11 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,  def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", +                  SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, +                                       SDTCisFP<0>, SDTCisVec<1>]>, +                  []>; +  def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;  def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5caf03e909b..db1f2b3a3c3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -623,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setTargetDAGCombine(ISD::FSUB);    
setTargetDAGCombine(ISD::FMINNUM);    setTargetDAGCombine(ISD::FMAXNUM); +  setTargetDAGCombine(ISD::FMA);    setTargetDAGCombine(ISD::SMIN);    setTargetDAGCombine(ISD::SMAX);    setTargetDAGCombine(ISD::UMIN); @@ -4945,6 +4946,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,    case Intrinsic::amdgcn_fmed3:      return DAG.getNode(AMDGPUISD::FMED3, DL, VT,                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); +  case Intrinsic::amdgcn_fdot2: +    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, +                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));    case Intrinsic::amdgcn_fmul_legacy:      return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,                         Op.getOperand(1), Op.getOperand(2)); @@ -7476,6 +7480,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,    return SDValue();  } +SDValue SITargetLowering::performFMACombine(SDNode *N, +                                            DAGCombinerInfo &DCI) const { +  SelectionDAG &DAG = DCI.DAG; +  EVT VT = N->getValueType(0); +  SDLoc SL(N); + +  if (!Subtarget->hasDLInsts() || VT != MVT::f32) +    return SDValue(); + +  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> +  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) +  SDValue Op1 = N->getOperand(0); +  SDValue Op2 = N->getOperand(1); +  SDValue FMA = N->getOperand(2); + +  if (FMA.getOpcode() != ISD::FMA || +      Op1.getOpcode() != ISD::FP_EXTEND || +      Op2.getOpcode() != ISD::FP_EXTEND) +    return SDValue(); + +  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, +  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract +  // is sufficient to allow generaing fdot2. 
+  const TargetOptions &Options = DAG.getTarget().Options; +  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || +      (N->getFlags().hasAllowContract() && +       FMA->getFlags().hasAllowContract())) { +    Op1 = Op1.getOperand(0); +    Op2 = Op2.getOperand(0); +    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || +        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) +      return SDValue(); + +    SDValue Vec1 = Op1.getOperand(0); +    SDValue Idx1 = Op1.getOperand(1); +    SDValue Vec2 = Op2.getOperand(0); + +    SDValue FMAOp1 = FMA.getOperand(0); +    SDValue FMAOp2 = FMA.getOperand(1); +    SDValue FMAAcc = FMA.getOperand(2); + +    if (FMAOp1.getOpcode() != ISD::FP_EXTEND || +        FMAOp2.getOpcode() != ISD::FP_EXTEND) +      return SDValue(); + +    FMAOp1 = FMAOp1.getOperand(0); +    FMAOp2 = FMAOp2.getOperand(0); +    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || +        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) +      return SDValue(); + +    SDValue Vec3 = FMAOp1.getOperand(0); +    SDValue Vec4 = FMAOp2.getOperand(0); +    SDValue Idx2 = FMAOp1.getOperand(1); + +    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || +        // Idx1 and Idx2 cannot be the same. 
+        Idx1 == Idx2) +      return SDValue(); + +    if (Vec1 == Vec2 || Vec3 == Vec4) +      return SDValue(); + +    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) +      return SDValue(); + +    if ((Vec1 == Vec3 && Vec2 == Vec4) || +        (Vec1 == Vec4 && Vec2 == Vec3)) +      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); +  } +  return SDValue(); +} +  SDValue SITargetLowering::performSetCCCombine(SDNode *N,                                                DAGCombinerInfo &DCI) const {    SelectionDAG &DAG = DCI.DAG; @@ -7660,6 +7737,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,        return performMinMaxCombine(N, DCI);      break;    } +  case ISD::FMA: +    return performFMACombine(N, DCI);    case ISD::LOAD: {      if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))        return Widended; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 3e4ff84ab47..ad049f2a71c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -136,6 +136,7 @@ private:    SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; +  SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;    SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 8e2eff13d6d..5c78ada3211 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -167,7 +167,7 @@ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;  let 
SubtargetPredicate = HasDLInsts in { -def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>; +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;  def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;  def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;  def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>; diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll new file mode 100644 index 00000000000..35364d34060 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll @@ -0,0 +1,232 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900 +; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-UNSAFE +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT +; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT +; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z) + +; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions +; are not converted from f16 to f32. 
+; GCN-LABEL: {{^}}dotproduct_f16 +; GFX900: v_fma_legacy_f16 +; GCN900: v_fma_legacy_f16 + +; GFX906: v_mul_f16_e32 +; GFX906: v_mul_f16_e32 + +; GFX906-UNSAFE:  v_fma_legacy_f16 + +; GFX906-CONTRACT: v_mac_f16_e32 +; GFX906-DENORM-CONTRACT: v_fma_legacy_f16 +define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1, +                                          <2 x half> addrspace(1)* %src2, +                                          half addrspace(1)* nocapture %dst) { +entry: +  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 +  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 +  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 + +  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 +  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 + +  %mul2 = fmul half %src1.el2, %src2.el2 +  %mul1 = fmul half %src1.el1, %src2.el1 +  %acc = load half, half addrspace(1)* %dst, align 2 +  %acc1 = fadd half %mul2, %acc +  %acc2 = fadd half %mul1, %acc1 +  store half %acc2, half addrspace(1)* %dst, align 2 +  ret void +} + + +; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32 +; and the vectors are of type <2 x half> +; GCN-LABEL: {{^}}dotproduct_f16_f32 +; GFX900: v_mad_mix_f32 +; GCN900: v_mad_mix_f32 + +; GFX906: v_mad_f32 +; GFX906: v_mac_f32_e32 + +; GFX906-UNSAFE: v_dot2_f32_f16 + +; GFX906-CONTRACT: v_dot2_f32_f16 + +; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 +define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1, +                                              <2 x half> addrspace(1)* %src2, +                                              float addrspace(1)* nocapture %dst) { +entry: +  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 +  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 +  %csrc1.el1 = fpext half 
%src1.el1 to float +  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 +  %csrc2.el1 = fpext half %src2.el1 to float + +  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 +  %csrc1.el2 = fpext half %src1.el2 to float +  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 +  %csrc2.el2 = fpext half %src2.el2 to float + +  %mul2 = fmul float %csrc1.el2, %csrc2.el2 +  %mul1 = fmul float %csrc1.el1, %csrc2.el1 +  %acc = load float, float addrspace(1)* %dst, align 4 +  %acc1 = fadd float %mul2, %acc +  %acc2 = fadd float %mul1, %acc1 +  store float %acc2, float addrspace(1)* %dst, align 4 +  ret void +} + +; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32 +; and the vectors are of type <2 x half> +; GCN-LABEL: {{^}}dotproduct_diffvecorder +; GFX900: v_mad_mix_f32 +; GCN900: v_mad_mix_f32 + +; GFX906: v_mad_f32 +; GFX906: v_mac_f32_e32 + +; GFX906-UNSAFE: v_dot2_f32_f16 + +; GFX906-CONTRACT: v_dot2_f32_f16 +; GFX906-DENORM-CONTRACT: v_dot2_f32_f16 +define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1, +                                                   <2 x half> addrspace(1)* %src2, +                                                   float addrspace(1)* nocapture %dst) { +entry: +  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 +  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 +  %csrc1.el1 = fpext half %src1.el1 to float +  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 +  %csrc2.el1 = fpext half %src2.el1 to float + +  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 +  %csrc1.el2 = fpext half %src1.el2 to float +  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 +  %csrc2.el2 = fpext half %src2.el2 to float + +  %mul2 = fmul float %csrc2.el2, %csrc1.el2 +  %mul1 = fmul float %csrc1.el1, %csrc2.el1 +  %acc = load float, float addrspace(1)* %dst, align 4 +  %acc1 = fadd float 
%mul2, %acc +  %acc2 = fadd float %mul1, %acc1 +  store float %acc2, float addrspace(1)* %dst, align 4 +  ret void +} + +; Tests to make sure dot product is not generated when the vectors are not of <2 x half>. +; GCN-LABEL: {{^}}dotproduct_v4f16 +; GFX900: v_mad_mix_f32 + +; GFX906: v_mad_f32 +; GFX906: v_mac_f32_e32 + +; GFX906-UNSAFE: v_fma_mix_f32 + +; GFX906-CONTRACT: v_fma_mix_f32 +; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1, +                                            <4 x half> addrspace(1)* %src2, +                                            float addrspace(1)* nocapture %dst) { +entry: +  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1 +  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0 +  %csrc1.el1 = fpext half %src1.el1 to float +  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0 +  %csrc2.el1 = fpext half %src2.el1 to float + +  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1 +  %csrc1.el2 = fpext half %src1.el2 to float +  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1 +  %csrc2.el2 = fpext half %src2.el2 to float + +  %mul2 = fmul float %csrc1.el2, %csrc2.el2 +  %mul1 = fmul float %csrc1.el1, %csrc2.el1 +  %acc = load float, float addrspace(1)* %dst, align 4 +  %acc1 = fadd float %mul2, %acc +  %acc2 = fadd float %mul1, %acc1 +  store float %acc2, float addrspace(1)* %dst, align 4 +  ret void +} + +; GCN-LABEL: {{^}}NotAdotproduct +; GFX900: v_mad_mix_f32 +; GCN900: v_mad_mix_f32 + +; GFX906: v_mad_f32 +; GFX906: v_mac_f32_e32 + +; GFX906-UNSAFE: v_fma_mix_f32 + +; GFX906-CONTRACT: v_fma_mix_f32 +; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1, +                                          <2 x half> addrspace(1)* %src2, +                                          float addrspace(1)* nocapture %dst) { +entry: +  
%src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 +  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 +  %csrc1.el1 = fpext half %src1.el1 to float +  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 +  %csrc2.el1 = fpext half %src2.el1 to float + +  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 +  %csrc1.el2 = fpext half %src1.el2 to float +  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 +  %csrc2.el2 = fpext half %src2.el2 to float + +  %mul2 = fmul float %csrc1.el2, %csrc1.el1 +  %mul1 = fmul float %csrc2.el1, %csrc2.el2 +  %acc = load float, float addrspace(1)* %dst, align 4 +  %acc1 = fadd float %mul2, %acc +  %acc2 = fadd float %mul1, %acc1 +  store float %acc2, float addrspace(1)* %dst, align 4 +  ret void +} + +; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct +; GFX900: v_mad_mix_f32 +; GCN900: v_mad_mix_f32 + +; GFX906: v_mad_f32 +; GFX906: v_mac_f32_e32 + +; GFX906-UNSAFE: v_fma_mix_f32 + +; GFX906-CONTRACT: v_fma_mix_f32 +; GFX906-DENORM-CONTRACT: v_fma_mix_f32 +define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1, +                                                   <2 x half> addrspace(1)* %src2, +                                                   float addrspace(1)* nocapture %dst) { +entry: +  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1 +  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2 + +  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0 +  %csrc1.el1 = fpext half %src1.el1 to float +  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0 +  %csrc2.el1 = fpext half %src2.el1 to float + +  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1 +  %csrc1.el2 = fpext half %src1.el2 to float +  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1 +  %csrc2.el2 = fpext half %src2.el2 to float + +  %mul2 = fmul float %csrc1.el2, %csrc2.el1 +  %mul1 = fmul float %csrc1.el1, %csrc2.el2 +  %acc = 
load float, float addrspace(1)* %dst, align 4 +  %acc1 = fadd float %mul2, %acc +  %acc2 = fadd float %mul1, %acc1 +  store float %acc2, float addrspace(1)* %dst, align 4 +  ret void +}
\ No newline at end of file  | 

