Index: lib/Target/R600/SIISelLowering.cpp
===================================================================
--- lib/Target/R600/SIISelLowering.cpp
+++ lib/Target/R600/SIISelLowering.cpp
@@ -1002,8 +1002,45 @@
   SDValue Zero = DAG.getConstant(0, MVT::i32);
   SDValue One = DAG.getConstant(1, MVT::i32);
 
-  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
-  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+
+  // Undo the combine done in visitSINT_TO_FP / visitUINT_TO_FP:
+  // f64 (select (i1 cnd), [+|-]1.0, 0.0) -> f64 [u|s]int_to_fp (i1 cnd)
+  //
+  // It is larger and more expensive to do the two selects and materialize the
+  // odd constant than to select an i32 -1 / 0 and convert the result to f64.
+  //
+  // = 16 byte, 12 cycle
+  // v_cndmask_b32_e32 v0, 0, -1, s[0:1]
+  // v_cvt_f64_i32_e32 v[0:1], v0
+  //
+  // vs.
+  //
+  // = 20 byte, 16 cycle
+  // v_mov_b32_e32 v0, 0xbff00000
+  // v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+  // v_mov_b32 v0, 0
+  //
+
+  if (const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS)) {
+    if (CRHS->isNullValue()) {
+      if (const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS)) {
+        if (CLHS->getZExtValue() == DoubleToBits(-1.0)) {
+          SDValue Cvt = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f64, Cond);
+          return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
+        }
+
+        if (CLHS->getZExtValue() == DoubleToBits(1.0)) {
+          SDValue Cvt = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f64, Cond);
+          return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Cvt);
+        }
+      }
+    }
+  }
+
+  LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, RHS);
 
   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
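
For reference, a standalone sketch of the bit patterns the new combine keys on (not part of the diff; the local DoubleToBits below is a stand-in with the same semantics as llvm::DoubleToBits from llvm/Support/MathExtras.h, i.e. it reinterprets the IEEE-754 representation of a double as a uint64_t). This is also why the i64 variants of the new tests select the literals u0xbff0000000000000 and u0x3ff0000000000000:

  // Build with any C++11 compiler; all three assertions hold.
  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static uint64_t DoubleToBits(double D) {
    uint64_t Bits;
    static_assert(sizeof(Bits) == sizeof(D), "double must be 64 bits");
    std::memcpy(&Bits, &D, sizeof(D)); // Type-pun via memcpy, not a cast.
    return Bits;
  }

  int main() {
    assert(DoubleToBits(-1.0) == UINT64_C(0xbff0000000000000)); // SINT_TO_FP case
    assert(DoubleToBits(1.0) == UINT64_C(0x3ff0000000000000));  // UINT_TO_FP case
    assert(DoubleToBits(0.0) == 0); // the isNullValue() operand
    return 0;
  }
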
Index: test/CodeGen/R600/fceil64.ll
===================================================================
--- test/CodeGen/R600/fceil64.ll
+++ test/CodeGen/R600/fceil64.ll
@@ -25,8 +25,8 @@
 ; SI: v_cmp_lg_f64
 ; SI: v_cmp_gt_f64
 ; SI: s_and_b64
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1,
+; SI-NEXT: v_cvt_f64_u32_e32
 ; SI: v_add_f64
 ; SI: s_endpgm
 define void @fceil_f64(double addrspace(1)* %out, double %x) {
Index: test/CodeGen/R600/ffloor.ll
===================================================================
--- test/CodeGen/R600/ffloor.ll
+++ test/CodeGen/R600/ffloor.ll
@@ -26,8 +26,8 @@
 ; SI: v_cmp_lg_f64
 ; SI: v_cmp_lt_f64
 ; SI: s_and_b64
-; SI: v_cndmask_b32
-; SI: v_cndmask_b32
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
+; SI-NEXT: v_cvt_f64_i32_e32
 ; SI: v_add_f64
 ; SI: s_endpgm
 define void @ffloor_f64(double addrspace(1)* %out, double %x) {
Index: test/CodeGen/R600/sint_to_fp.f64.ll
===================================================================
--- test/CodeGen/R600/sint_to_fp.f64.ll
+++ test/CodeGen/R600/sint_to_fp.f64.ll
@@ -12,11 +12,9 @@
 
 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: buffer_store_dwordx2
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
@@ -58,3 +56,42 @@
   store double %result, double addrspace(1)* %out
   ret void
 }
+
+; SI-LABEL: {{^}}select_sint_to_fp_i1_vals_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @select_sint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, double -1.0, double 0.0
+  store double %select, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}select_sint_to_fp_i1_vals_i64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, [[CMP]]
+; SI: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @select_sint_to_fp_i1_vals_i64(i64 addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
+  store i64 %select, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; TODO: This should invert the compare and swap the select operands, then apply the same combine.
+; SI-LABEL: {{^}}swap_select_sint_to_fp_i1_vals_f64:
+; SI-NOT: v_cvt_f64_i32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
+define void @swap_select_sint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, double 0.0, double -1.0
+  store double %select, double addrspace(1)* %out, align 8
+  ret void
+}
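
A sketch of what that TODO is after (hypothetical IR, not part of the patch): inverting the compare and swapping the select operands yields an equivalent function that already matches the new combine and would lower to v_cndmask_b32_e64 plus v_cvt_f64_i32_e32:

  ; Hand-rewritten equivalent of swap_select_sint_to_fp_i1_vals_f64 above:
  ; (%in == 0 ? 0.0 : -1.0) is the same as (%in != 0 ? -1.0 : 0.0).
  define void @swap_select_sint_inverted_by_hand(double addrspace(1)* %out, i32 %in) {
    %cmp = icmp ne i32 %in, 0
    %select = select i1 %cmp, double -1.0, double 0.0
    store double %select, double addrspace(1)* %out, align 8
    ret void
  }
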
Index: test/CodeGen/R600/uint_to_fp.f64.ll
===================================================================
--- test/CodeGen/R600/uint_to_fp.f64.ll
+++ test/CodeGen/R600/uint_to_fp.f64.ll
@@ -72,11 +72,9 @@
 
 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
-; uses an SGPR for [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
-; SI: buffer_store_dwordx2
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
+; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
@@ -95,3 +93,42 @@
   store double %fp, double addrspace(1)* %out, align 8
   ret void
 }
+
+; SI-LABEL: {{^}}select_uint_to_fp_i1_vals_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
+; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @select_uint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, double 1.0, double 0.0
+  store double %select, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}select_uint_to_fp_i1_vals_i64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, [[CMP]]
+; SI: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @select_uint_to_fp_i1_vals_i64(i64 addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
+  store i64 %select, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; TODO: This should invert the compare and swap the select operands, then apply the same combine.
+; SI-LABEL: {{^}}swap_select_uint_to_fp_i1_vals_f64:
+; SI-NOT: v_cvt_f64_u32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
+define void @swap_select_uint_to_fp_i1_vals_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %select = select i1 %cmp, double 0.0, double 1.0
+  store double %select, double addrspace(1)* %out, align 8
+  ret void
+}
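
The unsigned case reads the same way; as another hypothetical hand-rewrite (not something the patch performs), the inverted form becomes the 1.0 / 0.0 select the combine recognizes and would lower to v_cndmask_b32_e64 plus v_cvt_f64_u32_e32:

  ; Hand-rewritten equivalent of swap_select_uint_to_fp_i1_vals_f64 above:
  ; (%in == 0 ? 0.0 : 1.0) is the same as (%in != 0 ? 1.0 : 0.0).
  define void @swap_select_uint_inverted_by_hand(double addrspace(1)* %out, i32 %in) {
    %cmp = icmp ne i32 %in, 0
    %select = select i1 %cmp, double 1.0, double 0.0
    store double %select, double addrspace(1)* %out, align 8
    ret void
  }

Automating that inversion would presumably sit next to the new check in LowerSELECT, but it also has to invert the i1 condition itself, which is likely why both test files leave it as a TODO.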