Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15985,7 +15985,8 @@
   // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
   if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
       !VT.isVector() &&
-      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)) &&
+      !TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, VT)) {
     SDLoc DL(N);
     return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
                          DAG.getConstantFP(0.0, DL, VT));
@@ -16035,7 +16036,8 @@
   // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
   if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
-      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)) &&
+      !TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, VT)) {
     SDLoc DL(N);
     return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
                          DAG.getConstantFP(0.0, DL, VT));
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8785,13 +8785,53 @@
   SDLoc DL(Op);
   SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+
+  if (const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS)) {
+    if (CRHS->isNullValue()) {
+      // Undo combine done in visitSINT_TO_FP / visitUINT_TO_FP.
+      // f64 (select (i1 cnd), [+|-]1.0, 0.0) -> f64 [u|s]int_to_fp (i1 cnd)
+      //
+      // It is larger and more expensive to do the 2 selects and materialize the
+      // weird constant than selecting an i32 -1 / 0 and doing the conversion to f64.
+      //
+      // = 12 byte, (12 cycle or 20 cycles depending on part)
+      // v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+      // v_cvt_f64_i32_e32 v[0:1], v0
+      //
+      // vs.
+      //
+      // = 20 byte, 12 cycle
+      // v_mov_b32_e32 v0, 0xbff00000
+      // v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+      // v_mov_b32 v0, 0
+      //
+      // FIXME: Should check real instruction rate. Scheduler model seems to
+      // say v_cvt_f64_i32 is always quarter rate, but some older documents
+      // suggest half
+
+      if (const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS)) {
+        if (CLHS->getZExtValue() == DoubleToBits(-1.0)) {
+          SDValue Cvt = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f64, Cond);
+          return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
+        }
+
+        if (CLHS->getZExtValue() == DoubleToBits(1.0)) {
+          SDValue Cvt = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Cond);
+          return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
+        }
+      }
+    }
+  }
+
+  LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, RHS);
+
   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
   SDValue One = DAG.getConstant(1, DL, MVT::i32);
-  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
-
   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
Index: llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -38,12 +38,12 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
-; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -52,12 +52,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -232,12 +232,12 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
-; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -246,12 +246,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -265,11 +265,10 @@
 ; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    v_cvt_f64_i32_e32 v[2:3], v2
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
@@ -283,12 +282,12 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
-; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -297,12 +296,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -316,11 +315,10 @@
 ; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GCN-NEXT:    v_cvt_f64_i32_e32 v[2:3], v2
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
Index: llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -286,12 +286,12 @@
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
-; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
@@ -300,12 +300,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -402,12 +402,12 @@
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
-; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
@@ -416,12 +416,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -435,11 +435,10 @@
 ; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], v2
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
@@ -453,12 +452,12 @@
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
-; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; SI-NEXT:    s_endpgm
@@ -467,12 +466,12 @@
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
-; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -486,11 +485,10 @@
 ; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], v2
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
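
Note (illustration only, not part of the patch): a minimal standalone IR sketch of the two shapes these changes interact with is shown below. The function names are hypothetical; the checked-in tests above (v_select_sint_to_fp_i1_vals_f64, v_select_uint_to_fp_i1_vals_f64, and their i64 variants) already exercise the same patterns. Compiling it with something like llc -mtriple=amdgcn -mcpu=tonga should produce the v_cndmask_b32 + v_cvt_f64_i32 / v_cvt_f64_u32 sequences checked above rather than a materialized 0xbff00000 / 0x3ff00000 constant.

; Hypothetical reproducer (sketch only); function names are not from the patch.
define double @sitofp_i1_to_f64(i32 %in) {
  ; sint_to_fp of an i1 compare: the DAGCombiner change skips the
  ; select-of-constants fold here when SINT_TO_FP is legal or custom
  ; for the result type.
  %cmp = icmp eq i32 %in, 0
  %fp = sitofp i1 %cmp to double
  ret double %fp
}

define double @select_i1_to_f64(i32 %in) {
  ; select of 1.0 / 0.0 on an i1 condition: per the updated checks, the
  ; SIISelLowering change turns this back into a v_cndmask_b32 of 0/1
  ; followed by v_cvt_f64_u32 (and 0/-1 with v_cvt_f64_i32 for -1.0).
  %cmp = icmp eq i32 %in, 0
  %sel = select i1 %cmp, double 1.0, double 0.0
  ret double %sel
}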