Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,10 +13,12 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -233,6 +235,7 @@
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool SelectV_PACK_B32_F16(SDNode *Node, EVT VT);
 
   bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
@@ -775,6 +778,9 @@
     }
   }
 
+  if (SelectV_PACK_B32_F16(N, VT))
+    return;
+
   break;
 }
 
@@ -2678,6 +2684,56 @@
   SelectCode(N);
 }
 
+bool AMDGPUDAGToDAGISel::SelectV_PACK_B32_F16(SDNode *Node, EVT VT) {
+  // Match this pattern when it's safe to do so:
+  //   (v2f16 (build_vector (f16 (bitconvert (i16 (trunc VGPR_32:$src0)))),
+  //                        (f16 (bitconvert (i16 (trunc VGPR_32:$src1)))))),
+  //   (V_PACK_B32_F16_e64 SRCMODS.NONE, VGPR_32:$src0,
+  //                       SRCMODS.NONE, VGPR_32:$src1)
+  unsigned Opc = Node->getOpcode();
+  unsigned NumVectorElts = VT.getVectorNumElements();
+  if (VT.getScalarSizeInBits() == 16) {
+    if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2 &&
+        Node->getNumOperands() >= 2) {
+      SDValue LHS = Node->getOperand(0);
+      SDValue RHS = Node->getOperand(1);
+
+      if (LHS.getValueType() == MVT::f16 && RHS.getValueType() == MVT::f16 &&
+          LHS.getOpcode() == ISD::BITCAST && RHS.getOpcode() == ISD::BITCAST &&
+          LHS.getNumOperands() >= 1 && RHS.getNumOperands() >= 1) {
+
+        SDValue Op0 = LHS.getOperand(0);
+        SDValue Op1 = RHS.getOperand(0);
+
+        if (Op0.getOpcode() == ISD::TRUNCATE &&
+            Op1.getOpcode() == ISD::TRUNCATE &&
+            Op0.getValueType() == MVT::i16 && Op1.getValueType() == MVT::i16) {
+
+          const SITargetLowering &Lowering =
+              *static_cast<const SITargetLowering *>(getTargetLowering());
+
+          // Only select this instruction if the inputs have already been
+          // flushed/quieted (i.e. they are canonicalized).
+          if (!Lowering.isCanonicalized(*CurDAG, Op0.getOperand(0)) ||
+              !Lowering.isCanonicalized(*CurDAG, Op1.getOperand(0)))
+            return false;
+
+          SDValue Mods = CurDAG->getTargetConstant(0, {}, MVT::i32);
+          // Operands: src0_modifiers, src0, src1_modifiers, src1, clamp,
+          // op_sel.
+          CurDAG->SelectNodeTo(
+              Node, AMDGPU::V_PACK_B32_F16_e64, Node->getVTList(),
+              {Mods, Op0, Mods, Op1,
+               CurDAG->getTargetConstant(0, {}, MVT::i1), Mods});
+
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods,
                                             bool AllowAbs) const {
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9630,7 +9630,8 @@
         TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
       return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
     }
-  }
+  } else if (Op.getValueType() == MVT::i32 && Src.getValueType() == MVT::f32)
+    return isCanonicalized(DAG, Src, MaxDepth - 1);
 
   return false;
 }
Index: llvm/test/CodeGen/AMDGPU/v_pack.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+
+; GCN-LABEL: v_pack_b32_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v0, 2.0, v1
+; GCN-NEXT:    v_add_f32_e32 v1, 2.0, v2
+; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use v0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
+define amdgpu_kernel void @v_pack_b32_v2f16(float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds float, float addrspace(1)* %in0, i64 %tid.ext
+  %in1.gep = getelementptr inbounds float, float addrspace(1)* %in1, i64 %tid.ext
+  %v0 = load volatile float, float addrspace(1)* %in0.gep
+  %v1 = load volatile float, float addrspace(1)* %in1.gep
+  %v0.add = fadd float %v0, 2.0
+  %v1.add = fadd float %v1, 2.0
+  %val0 = bitcast float %v0.add to i32
+  %val1 = bitcast float %v1.add to i32
+  %lo.i = trunc i32 %val0 to i16
+  %hi.i = trunc i32 %val1 to i16
+  %lo = bitcast i16 %lo.i to half
+  %hi = bitcast i16 %hi.i to half
+  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
+  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
+  %vec.i32 = bitcast <2 x half> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
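A note on the test: as its NOTE header says, the CHECK lines in v_pack.ll were autogenerated by utils/update_llc_test_checks.py, so if the IR changes they should be regenerated rather than hand-edited. A typical invocation looks like the following (the build directory path is illustrative, not part of the patch):

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/AMDGPU/v_pack.ll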