Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -6989,27 +6989,42 @@ // TODO: This could be better with wider vectors that will be split to v2f16, // and to consider uses since there aren't that many packed operations. - if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) { + if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && + isTypeLegal(MVT::v2f16)) { SDLoc SL(N); SDValue NewElts[2]; SDValue Lo = N0.getOperand(0); SDValue Hi = N0.getOperand(1); + EVT EltVT = Lo.getValueType(); + if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { for (unsigned I = 0; I != 2; ++I) { SDValue Op = N0.getOperand(I); - EVT EltVT = Op.getValueType(); if (ConstantFPSDNode *CFP = dyn_cast(Op)) { NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF()); } else if (Op.isUndef()) { - // This would ordinarily be folded to a qNaN. Since this may be half - // of a packed operation, it may be cheaper to use a 0. - NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT); + // Handled below based on what the other operand is. + NewElts[I] = Op; } else { NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); } } + // If one half is undef, and one is constant, prefer a splat vector rather + // than the normal qNaN. If it's a register, prefer 0.0 since that's + // cheaper to use and may be free with a packed operation. + if (NewElts[0].isUndef()) { + if (isa(NewElts[1])) + NewElts[0] = isa(NewElts[1]) ? + NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT); + } + + if (NewElts[1].isUndef()) { + NewElts[1] = isa(NewElts[0]) ? 
+ NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT); + } + return DAG.getBuildVector(VT, SL, NewElts); } } Index: test/CodeGen/AMDGPU/clamp.ll =================================================================== --- test/CodeGen/AMDGPU/clamp.ll +++ test/CodeGen/AMDGPU/clamp.ll @@ -688,6 +688,38 @@ ret void } +; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GFX9-NOT: [[A]] +; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %a = load <2 x half>, <2 x half> addrspace(1)* %gep0 + %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> ) + %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> ) + + store <2 x half> %med, <2 x half> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GFX9-NOT: [[A]] +; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}} +define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %a = load <2 x half>, <2 x half> addrspace(1)* %gep0 + %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> ) + %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> ) + + store <2 x half> %med, <2 x half> addrspace(1)* %out.gep + ret void +} + ; GCN-LABEL: {{^}}v_clamp_diff_source_f32: ; GCN: v_add_f32_e32 [[A:v[0-9]+]] ; GCN: 
v_add_f32_e32 [[B:v[0-9]+]] Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -565,20 +565,71 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16: -; GFX9: s_waitcnt -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0 -; GFX9-NEXT: s_setpc_b64 - -; VI: s_waitcnt -; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_setpc_b64 +; GFX89: s_waitcnt +; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX89-NEXT: s_setpc_b64 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { %vec = insertelement <2 x half> undef, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized } +; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16: +; GCN: s_waitcnt +; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GFX89-NEXT: s_setpc_b64 + +; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v1, 1.0 +; CI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 1.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16: +; GCN: s_waitcnt +; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GFX89-NEXT: s_setpc_b64 + +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 1.0, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: 
{{^}}v_test_canonicalize_undef_lo_k_hi_v2f16: +; GCN: s_waitcnt +; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00 +; GFX89-NEXT: s_setpc_b64 + +; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; CI-NEXT: v_mov_b32_e32 v1, 0x41800000 +; CI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 16.0, i32 1 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + +; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16: +; GCN: s_waitcnt +; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00 +; GFX89-NEXT: s_setpc_b64 + +; CI-NEXT: v_mov_b32_e32 v0, 0x41800000 +; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; CI-NEXT: s_setpc_b64 +define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 { + %vec = insertelement <2 x half> undef, half 16.0, i32 0 + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) + ret <2 x half> %canonicalized +} + ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16: ; GFX9: s_waitcnt ; GFX9-DAG: v_max_f16_e32 v0, v0, v0