Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -6887,15 +6887,19 @@ return DAG.getConstantFP(C, SL, VT); } +static bool vectorEltWillFoldAway(SDValue Op) { + return Op.isUndef() || isa<ConstantFPSDNode>(Op); +} + SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); // fcanonicalize undef -> qnan if (N0.isUndef()) { - EVT VT = N->getValueType(0); APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)); return DAG.getConstantFP(QNaN, SDLoc(N), VT); } @@ -6905,6 +6909,40 @@ return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); } + unsigned NumOps = N0.getNumOperands(); + + // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), + // (fcanonicalize k) + // + // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 + + // TODO: This could be better with wider vectors that will be split to v2f16, + // and to consider uses since there aren't that many packed operations. + if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) { + SDLoc SL(N); + SDValue NewElts[2]; + SDValue Lo = N0.getOperand(0); + SDValue Hi = N0.getOperand(1); + if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { + for (unsigned I = 0; I != 2; ++I) { + SDValue Op = N0.getOperand(I); + EVT EltVT = Op.getValueType(); + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { + NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, + CFP->getValueAPF()); + } else if (Op.isUndef()) { + // This would ordinarily be folded to a qNaN. Since this may be half + // of a packed operation, it may be cheaper to use a 0. 
+ NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT); + } else { + NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); + } + } + + return DAG.getBuildVector(VT, SL, NewElts); + } + } + return isCanonicalized(DAG, N0) ? N0 : SDValue(); } Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -6,8 +6,17 @@ declare half @llvm.canonicalize.f16(half) #0 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 +; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16: +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 { + %canonicalized = call half @llvm.canonicalize.f16(half undef) + store half %canonicalized, half addrspace(1)* %out + ret void +} ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} @@ -32,6 +41,21 @@ ret void } +; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16: +; GFX9: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 + +; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_max_f16_e32 v0, v0, v0 +; VI: v_or_b32_e32 v0, v0, v1 +define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { + %ins0 = insertelement <2 x half> undef, half %lo, i32 0 + %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1) + ret <2 x half> %canonicalized +} + ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: ; GFX89: 
v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] @@ -476,6 +500,162 @@ ret void } +; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00 +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 { + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, 0, 16, v0 +; GFX9-NEXT: s_setpc_b64 + +; High bits known zero +; FIXME: Should also be true on gfx9 by default? +; VI: s_waitcnt +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { + %vec = insertelement <2 x half> undef, half %val, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX9-NEXT: s_setpc_b64 + +; VI: s_waitcnt +; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { + %vec = insertelement <2 x half> undef, half %val, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16: +; GFX9: s_waitcnt +; GFX9-DAG: v_max_f16_e32 v0, v0, v0 +; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000 +; GFX9: v_and_b32_e32 v0, 0xffff, v0 +; GFX9: 
v_lshl_or_b32 v0, [[K]], 16, v0 +; GFX9: s_setpc_b64 + +; VI: s_waitcnt +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, 2.0, v0 +; VI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { + %vec0 = insertelement <2 x half> undef, half %val, i32 0 + %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16: +; GFX9: v_max_f16_e32 v0, v0, v0 +; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000 +; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]] +; GFX9: s_setpc_b64 + +; VI: s_waitcnt +; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0 +; VI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { + %vec0 = insertelement <2 x half> undef, half 2.0, i32 0 + %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16: +; GCN: v_mov_b32_e32 v0, 0x7e007e00 +; GCN: v_mov_b32_e32 v1, v0 +define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 { + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) + store <4 x half> %canonicalized, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_movk_i32 [[K:s[0-9]+]], 0x7e00 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, [[K]], 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 + +; VI: s_waitcnt +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; VI-NEXT: s_setpc_b64 
+define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 { + %vec = insertelement <4 x half> undef, half %val, i32 0 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) + ret <4 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX9-NEXT: s_setpc_b64 + +; VI: s_waitcnt +; VI-DAG: v_max_f16_e32 v0, v0, v0 +; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; VI-NEXT: s_setpc_b64 +define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 { + %vec0 = insertelement <4 x half> undef, half %val0, i32 0 + %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1) + ret <4 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX9: s_waitcnt +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX9-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX9-NEXT: s_movk_i32 s6, 0x7e00 +; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_setpc_b64 + +; VI: s_waitcnt +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_setpc_b64 +define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half 
%val0, half %val1, half %val2) #1 { + %vec0 = insertelement <4 x half> undef, half %val0, i32 0 + %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 + %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 + %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec2) + ret <4 x half> %canonicalized +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind "target-features"="-fp64-fp16-denormals" }