diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2457,10 +2457,6 @@ SDLoc SL(Op); SDValue Src = Op.getOperand(0); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); - SDValue Lo, Hi; std::tie(Lo, Hi) = split64BitValue(Src, DAG); SDValue Sign; @@ -2468,25 +2464,38 @@ if (Signed && Subtarget->isGCN()) { // We also need to consider the sign bit in Lo if Hi has just sign bits, // i.e. Hi is 0 or -1. However, that only needs to take the MSB into - // account. - SDValue HasSameSign = - DAG.getSetCC(SL, SetCCVT, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi), - ZeroI32, ISD::SETGE); - SDValue MaxShAmt = DAG.getSelect(SL, MVT::i32, HasSameSign, - DAG.getConstant(33, SL, MVT::i32), - DAG.getConstant(32, SL, MVT::i32)); + // account. That is, the maximal shift is + // - 32 if Lo and Hi have opposite signs; + // - 33 if Lo and Hi have the same sign. + // + // Or, MaxShAmt = 33 + OppositeSign, where + // + // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is + // - -1 if Lo and Hi have opposite signs; and + // - 0 otherwise. + // + // All in all, ShAmt is calculated as + // + // umin(sffbh(Hi), 33 + ((Lo ^ Hi) >> 31)) - 1, + // + // or, equivalently, + // + // umin(sffbh(Hi) - 1, 32 + ((Lo ^ Hi) >> 31)). + // + // The latter form is used to shorten the critical path. + SDValue OppositeSign = DAG.getNode( + ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi), + DAG.getConstant(31, SL, MVT::i32)); + SDValue MaxShAmt = + DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), + OppositeSign); // Count the leading sign bits. ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi); - ShAmt = DAG.getSelect(SL, MVT::i32, - DAG.getSetCC(SL, SetCCVT, ShAmt, - DAG.getAllOnesConstant(SL, MVT::i32), - ISD::SETNE), - ShAmt, MaxShAmt); - // The shift amount for signed integers is [1, 33]. // Different from unsigned conversion, the shift should be one bit less to // preserve the sign bit. ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt, DAG.getConstant(1, SL, MVT::i32)); + ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt); } else { if (Signed) { // Without 'ffbh_i32', only leading zeros could be counted. Take the @@ -2507,9 +2516,9 @@ // Split it again. std::tie(Lo, Hi) = split64BitValue(Norm, DAG); // Calculate the adjust bit for rounding. - SDValue Adjust = DAG.getSelect( - SL, MVT::i32, DAG.getSetCC(SL, SetCCVT, Lo, ZeroI32, ISD::SETNE), - DAG.getConstant(1, SL, MVT::i32), ZeroI32); + // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo) + SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32, + DAG.getConstant(1, SL, MVT::i32), Lo); // Get the 32-bit normalized integer. Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust); // Convert the normalized 32-bit integer into f32.
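For reference, the arithmetic that the reworked signed i64 -> f32 path boils down to can be written as a short scalar sketch. This is illustrative only and not part of the patch: sffbh below is a hypothetical model of v_ffbh_i32 / s_flbit_i32 (the count of leading bits equal to the sign bit, all-ones when the input is 0 or -1), __builtin_clrsb is a GCC/Clang builtin, and std::min / std::ldexp stand in for the UMIN node and the ldexp intrinsic.

// A minimal scalar sketch of what the new signed i64 -> f32 lowering computes.
// Assumption: sffbh is an illustrative model of v_ffbh_i32 / s_flbit_i32,
// not an LLVM or ROCm API.
#include <algorithm>
#include <cmath>
#include <cstdint>

// Count of leading bits equal to the sign bit; all-ones when X is 0 or -1.
static uint32_t sffbh(uint32_t X) {
  int32_t S = static_cast<int32_t>(X);
  if (S == 0 || S == -1)
    return ~0u;
  return static_cast<uint32_t>(__builtin_clrsb(S)) + 1;
}

static float SignedI64ToF32(int64_t V) {
  uint32_t Lo = static_cast<uint32_t>(V);
  uint32_t Hi = static_cast<uint32_t>(static_cast<uint64_t>(V) >> 32);
  // OppositeSign = (Lo ^ Hi) >> 31 (arithmetic shift): -1 if the two halves
  // have opposite sign bits, 0 otherwise.
  int32_t OppositeSign = static_cast<int32_t>(Lo ^ Hi) >> 31;
  uint32_t MaxShAmt = static_cast<uint32_t>(32 + OppositeSign); // 31 or 32
  // umin(sffbh(Hi) - 1, MaxShAmt): when Hi is 0 or -1, sffbh(Hi) - 1 wraps to
  // a huge unsigned value, so the clamp takes over and the old compare/select
  // against -1 is no longer needed.
  uint32_t ShAmt = std::min(sffbh(Hi) - 1u, MaxShAmt);
  uint64_t Norm = static_cast<uint64_t>(V) << ShAmt;
  uint32_t NormLo = static_cast<uint32_t>(Norm);
  uint32_t NormHi = static_cast<uint32_t>(Norm >> 32);
  // Sticky/adjust bit for rounding: (NormLo != 0) ? 1 : 0 == umin(1, NormLo).
  uint32_t Adjust = std::min(1u, NormLo);
  // cvt_f32_i32 on the normalized high word, then scale back with ldexp.
  float F = static_cast<float>(static_cast<int32_t>(NormHi | Adjust));
  return std::ldexp(F, 32 - static_cast<int>(ShAmt));
}

The unsigned path differs only in clamping ctlz(Hi) to 32 and converting with an unsigned cvt; the umin(1, lo) sticky-bit trick is shared by both paths, which is what removes the remaining compare/cndmask pairs in the tests below.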
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2065,7 +2065,6 @@ const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); - const LLT S1 = LLT::scalar(1); assert(MRI.getType(Src) == S64); @@ -2089,29 +2088,24 @@ assert(MRI.getType(Dst) == S32); - auto Zero = B.buildConstant(S32, 0); auto One = B.buildConstant(S32, 1); - auto AllOnes = B.buildConstant(S32, -1); MachineInstrBuilder ShAmt; if (Signed) { - auto ThirtyThree = B.buildConstant(S32, 33); + auto ThirtyOne = B.buildConstant(S32, 31); auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); - auto HasSameSign = B.buildICmp(CmpInst::ICMP_SGE, S1, X, Zero); - auto MaxShAmt = B.buildSelect(S32, HasSameSign, ThirtyThree, ThirtyTwo); + auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); + auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, /*HasSideEffects=*/false) .addUse(Unmerge.getReg(1)); - auto NotAllSameBits = B.buildICmp(CmpInst::ICMP_NE, S1, LS, AllOnes); - auto LS2 = B.buildSelect(S32, NotAllSameBits, LS, MaxShAmt); - ShAmt = B.buildSub(S32, LS2, One); + auto LS2 = B.buildSub(S32, LS, One); + ShAmt = B.buildUMin(S32, LS2, MaxShAmt); } else ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); auto Norm = B.buildShl(S64, Src, ShAmt); auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); - auto NotAllZeros = - B.buildICmp(CmpInst::ICMP_NE, S1, Unmerge2.getReg(0), Zero); - auto Adjust = B.buildSelect(S32, NotAllZeros, One, Zero); + auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); auto FVal = Signed ? 
B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1082,15 +1082,15 @@ ; SI-LABEL: v_test_sitofp_i64_byte_to_f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_ffbh_i32_e32 v2, 0 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, 33, v2, vcc ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; SI-NEXT: v_ffbh_i32_e32 v3, 0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: v_subrev_i32_e32 v2, vcc, 1, v2 +; SI-NEXT: v_min_u32_e32 v2, v3, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_min_u32_e32 v0, 1, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f32_i32_e32 v0, v0 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 @@ -1100,15 +1100,15 @@ ; VI-LABEL: v_test_sitofp_i64_byte_to_f32: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_i32_e32 v2, 0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v2 -; VI-NEXT: v_cndmask_b32_e32 v2, 33, v2, vcc ; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; VI-NEXT: v_ffbh_i32_e32 v3, 0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_subrev_u32_e32 v2, vcc, 1, v2 +; VI-NEXT: v_min_u32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 @@ -1128,8 +1128,7 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_min_u32_e32 v2, 32, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_min_u32_e32 v0, 1, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 @@ -1144,8 +1143,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_min_u32_e32 v0, 1, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sitofp.mir @@ -100,48 +100,40 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR 
[[UV]], [[UV1]] - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[SUB]](s32) + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX6: $vgpr0 = COPY [[INT1]](s32) ; GFX8-LABEL: name: test_sitofp_s64_to_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]] - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX8: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[SUB]](s32) + ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX8: 
[[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX8: $vgpr0 = COPY [[INT1]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -420,24 +412,20 @@ ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 33 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT_INREG]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]] - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[SUB]](s32) + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[UMIN]](s32) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX6: $vgpr0 = COPY [[INT1]](s32) ; GFX8-LABEL: name: test_sitofp_s33_to_s32 @@ -446,24 +434,20 @@ ; GFX8: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 33 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SEXT_INREG]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]] - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX8: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] 
- ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[SUB]](s32) + ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT_INREG]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX8: $vgpr0 = COPY [[INT1]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -482,24 +466,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]] - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[SUB]](s32) + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX6: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) @@ -508,24 +488,20 @@ ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[UV1]] - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX8: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV1]](s32) - ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[SUB]](s32) + ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT2]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX8: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) @@ -547,47 +523,41 @@ ; GFX6: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV2]], [[UV3]] - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV3]](s32) - ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[SUB]](s32) + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32) ; GFX6: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP 
intpred(ne), [[UV4]](s32), [[C1]] - ; GFX6: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT2]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX6: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX6: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]] - ; GFX6: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR1]](s32), [[C1]] - ; GFX6: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[C4]], [[C]] + ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]] ; GFX6: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) - ; GFX6: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT2]](s32), [[C3]] - ; GFX6: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s1), [[INT2]], [[SELECT3]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SELECT4]], [[C2]] - ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[SUB2]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C1]] + ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX6: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) - ; GFX6: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]] - ; GFX6: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[C2]], [[C1]] - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT5]] + ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]] + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX6: [[SITOFP1:%[0-9]+]]:_(s32) = G_SITOFP [[OR1]](s32) - ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB2]] + ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] ; GFX6: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP1]](s32), [[SUB3]](s32) ; GFX6: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT3]](s32) ; GFX6: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) ; GFX6: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] ; GFX6: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST]](<2 x s16>) @@ -596,47 +566,41 @@ ; GFX8: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV2]], [[UV3]] - ; GFX8: 
[[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C4]], [[C]] + ; GFX8: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[XOR]], [[C2]](s32) + ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV3]](s32) - ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT]](s32), [[C3]] - ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[INT]], [[SELECT]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[C2]] - ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[SUB]](s32) + ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[INT]], [[C1]] + ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SUB]], [[ADD]] + ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32) ; GFX8: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV4]](s32), [[C1]] - ; GFX8: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT2]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX8: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[OR]](s32) - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB]] + ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP]](s32), [[SUB1]](s32) ; GFX8: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX8: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV6]], [[UV7]] - ; GFX8: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[XOR1]](s32), [[C1]] - ; GFX8: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[C4]], [[C]] + ; GFX8: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[XOR1]], [[C2]](s32) + ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[C]], [[ASHR1]] ; GFX8: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sffbh), [[UV7]](s32) - ; GFX8: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[INT2]](s32), [[C3]] - ; GFX8: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s1), [[INT2]], [[SELECT3]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SELECT4]], [[C2]] - ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[SUB2]](s32) + ; GFX8: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[INT2]], [[C1]] + ; GFX8: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SUB2]], [[ADD1]] + ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX8: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) - ; GFX8: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]] - ; GFX8: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[C2]], [[C1]] - ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT5]] + ; GFX8: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]] + ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX8: [[SITOFP1:%[0-9]+]]:_(s32) = G_SITOFP [[OR1]](s32) - ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SUB2]] + ; GFX8: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] ; GFX8: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[SITOFP1]](s32), [[SUB3]](s32) ; GFX8: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT3]](s32) ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; GFX8: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C5]](s32) + ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) ; GFX8: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uitofp.mir @@ -75,15 +75,13 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -92,15 +90,13 @@ ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -393,15 +389,13 @@ ; GFX6: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]] ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[UMIN]](s32) ; GFX6: 
[[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C2]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C3]], [[C2]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C2]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -413,15 +407,13 @@ ; GFX8: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C1]] ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C2]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C3]], [[C2]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C2]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[UMIN]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -442,15 +434,13 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -461,15 +451,13 @@ ; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32) ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; 
GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[UMIN]](s32) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV2]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[SELECT]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV2]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV3]], [[UMIN1]] ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) @@ -493,35 +481,32 @@ ; GFX6: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX6: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV3]](s32) ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32) ; GFX6: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX6: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV4]](s32), [[C1]] - ; GFX6: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]] + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX6: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX6: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) ; GFX6: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; GFX6: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32) - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] - ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN1]](s32) + ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX6: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) - ; GFX6: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]] - ; GFX6: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C1]] - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT1]] + ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]] + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX6: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32) - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN1]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] ; GFX6: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32) ; GFX6: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX6: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX6: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], 
[[C2]](s32) ; GFX6: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] ; GFX6: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST]](<2 x s16>) @@ -530,35 +515,32 @@ ; GFX8: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX8: [[AMDGPU_FFBH_U32_:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV3]](s32) ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[C]] ; GFX8: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[UV]], [[UMIN]](s32) ; GFX8: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL]](s64) - ; GFX8: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV4]](s32), [[C1]] - ; GFX8: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[C2]], [[C1]] - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[SELECT]] + ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV4]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[UV5]], [[UMIN1]] ; GFX8: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[OR]](s32) ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN]] ; GFX8: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP]](s32), [[SUB]](s32) ; GFX8: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT]](s32) ; GFX8: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[UV7]](s32) - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] - ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN1]](s32) + ; GFX8: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[C]] + ; GFX8: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[UV1]], [[UMIN2]](s32) ; GFX8: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SHL1]](s64) - ; GFX8: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV8]](s32), [[C1]] - ; GFX8: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C2]], [[C1]] - ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[SELECT1]] + ; GFX8: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[C1]], [[UV8]] + ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[UV9]], [[UMIN3]] ; GFX8: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[OR1]](s32) - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN1]] + ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UMIN2]] ; GFX8: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.ldexp), [[UITOFP1]](s32), [[SUB1]](s32) ; GFX8: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC]](s16) ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) - ; GFX8: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32) ; GFX8: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -13,21 +13,18 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 
s0, s2, s3 -; GFX6-NEXT: s_flbit_i32 s8, s3 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[0:1], s0, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s8, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 33, v2 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: s_flbit_i32 s0, s3 +; GFX6-NEXT: s_xor_b32 s1, s2, s3 +; GFX6-NEXT: s_add_i32 s0, s0, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 31 +; GFX6-NEXT: s_add_i32 s1, s1, 32 +; GFX6-NEXT: s_min_u32 s8, s0, s1 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s0, 32, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -36,19 +33,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s4, s2, s3 -; GFX8-NEXT: s_cmp_gt_i32 s4, -1 -; GFX8-NEXT: s_flbit_i32 s5, s3 -; GFX8-NEXT: s_cselect_b32 s4, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s5, -1 -; GFX8-NEXT: s_cselect_b32 s6, s5, s4 -; GFX8-NEXT: s_add_i32 s4, s6, -1 +; GFX8-NEXT: s_xor_b32 s5, s2, s3 +; GFX8-NEXT: s_flbit_i32 s4, s3 +; GFX8-NEXT: s_ashr_i32 s5, s5, 31 +; GFX8-NEXT: s_add_i32 s4, s4, -1 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s2, 33, s6 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0 @@ -76,17 +71,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v0 -; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v5 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX6-NEXT: v_min_u32_e32 v0, v5, v0 +; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 33, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64 @@ -96,33 +89,30 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: 
v_add_u32_e32 v1, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v4, v1, v2 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; GFX8-NEXT: v_ffbh_i32_e32 v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, 32, 33, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v5, v[1:2] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, v1, v2 +; GFX8-NEXT: v_ffbh_i32_e32 v4, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 32, v3 +; GFX8-NEXT: v_min_u32_e32 v3, v4, v3 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v3 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 33, v4 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v4 -; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc -; GFX8-NEXT: flat_store_short v[0:1], v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid @@ -142,21 +132,18 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 s0, s2, s3 -; GFX6-NEXT: s_flbit_i32 s8, s3 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[0:1], s0, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s8, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 33, v2 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: s_flbit_i32 s0, s3 +; GFX6-NEXT: s_xor_b32 s1, s2, s3 +; GFX6-NEXT: s_add_i32 s0, s0, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 31 +; GFX6-NEXT: s_add_i32 s1, s1, 32 +; GFX6-NEXT: s_min_u32 s8, s0, s1 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX6-NEXT: s_sub_i32 s0, 32, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -164,20 +151,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s4, s2, s3 -; GFX8-NEXT: s_cmp_gt_i32 s4, -1 -; GFX8-NEXT: s_flbit_i32 s5, s3 -; GFX8-NEXT: s_cselect_b32 s4, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s5, -1 -; GFX8-NEXT: s_cselect_b32 s6, s5, s4 -; GFX8-NEXT: s_add_i32 s4, s6, -1 +; GFX8-NEXT: s_xor_b32 s5, s2, s3 +; GFX8-NEXT: s_flbit_i32 s4, s3 +; GFX8-NEXT: s_ashr_i32 s5, s5, 31 +; GFX8-NEXT: s_add_i32 s4, s4, -1 +; GFX8-NEXT: s_add_i32 s5, s5, 32 +; GFX8-NEXT: s_min_u32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v0 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 33, s6 +; GFX8-NEXT: s_sub_i32 s0, 32, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -203,17 +188,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_xor_b32_e32 v0, v3, v4 ; GFX6-NEXT: v_ffbh_i32_e32 v5, v4 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v0 -; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v5 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, -1, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX6-NEXT: v_min_u32_e32 v0, v5, v0 +; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 33, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 ; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0 ; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -222,31 +205,28 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v2 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX8-NEXT: v_ffbh_i32_e32 v5, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, v[1:2] +; GFX8-NEXT: v_ffbh_i32_e32 v4, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_min_u32_e32 v4, v4, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v6, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 33, v5 -; GFX8-NEXT: v_ldexp_f32 v2, v6, v2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -266,74 +246,64 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_xor_b32 s8, s6, s7 -; GFX6-NEXT: 
s_flbit_i32 s10, s7 +; GFX6-NEXT: s_flbit_i32 s8, s7 +; GFX6-NEXT: s_xor_b32 s9, s6, s7 +; GFX6-NEXT: s_flbit_i32 s10, s5 ; GFX6-NEXT: s_xor_b32 s11, s4, s5 -; GFX6-NEXT: s_flbit_i32 s12, s5 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[8:9], s8, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[8:9], s11, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 32, 33, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s10, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s12, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, -1, v0 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 33, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 33, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v3 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v0, v4 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v5 +; GFX6-NEXT: s_add_i32 s8, s8, -1 +; GFX6-NEXT: s_ashr_i32 s9, s9, 31 +; GFX6-NEXT: s_add_i32 s10, s10, -1 +; GFX6-NEXT: s_ashr_i32 s11, s11, 31 +; GFX6-NEXT: s_add_i32 s9, s9, 32 +; GFX6-NEXT: s_add_i32 s11, s11, 32 +; GFX6-NEXT: s_min_u32 s8, s8, s9 +; GFX6-NEXT: s_min_u32 s9, s10, s11 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GFX6-NEXT: s_sub_i32 s8, 32, s8 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX6-NEXT: s_sub_i32 s9, 32, s9 +; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_min_u32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s2, s6, s7 -; GFX8-NEXT: s_cmp_gt_i32 s2, -1 -; GFX8-NEXT: s_flbit_i32 s3, s7 -; GFX8-NEXT: s_cselect_b32 s2, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s3, -1 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_add_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s8, 33, s2 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s3 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] -; GFX8-NEXT: s_xor_b32 s2, s4, s5 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_cmp_gt_i32 s2, -1 -; GFX8-NEXT: s_flbit_i32 s3, s5 -; GFX8-NEXT: s_cselect_b32 s2, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s3, -1 -; GFX8-NEXT: s_cselect_b32 s6, s3, s2 -; GFX8-NEXT: s_add_i32 s2, s6, -1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX8-NEXT: s_sub_i32 s2, 33, s6 -; GFX8-NEXT: v_ldexp_f32 v1, v0, s8 -; GFX8-NEXT: v_ldexp_f32 v0, v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_xor_b32 s7, s2, s3 +; GFX8-NEXT: s_flbit_i32 s6, s3 +; GFX8-NEXT: s_ashr_i32 s7, s7, 31 +; GFX8-NEXT: s_add_i32 s6, s6, -1 +; GFX8-NEXT: s_add_i32 s7, s7, 32 +; GFX8-NEXT: s_min_u32 s6, s6, s7 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX8-NEXT: s_xor_b32 s2, s0, s1 +; GFX8-NEXT: s_flbit_i32 s8, s1 +; GFX8-NEXT: s_ashr_i32 s2, s2, 31 +; GFX8-NEXT: s_add_i32 s8, s8, -1 +; GFX8-NEXT: s_add_i32 s2, s2, 32 +; GFX8-NEXT: s_min_u32 s2, s8, s2 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: s_min_u32 s0, s0, 1 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s6 +; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s2 +; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> @@ -366,54 +336,46 @@ ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v12 -; GFX6-NEXT: v_cndmask_b32_e64 v12, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v14 -; GFX6-NEXT: v_cndmask_b32_e64 v14, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v16, 32, 33, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v9 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v13 -; GFX6-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; GFX6-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 33, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 33, v9 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 33, v12 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 33, v13 -; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v14 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v0 -; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v16 -; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v17 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16 +; GFX6-NEXT: v_min_u32_e32 v0, v9, v0 +; GFX6-NEXT: v_min_u32_e32 v9, v13, v12 +; GFX6-NEXT: v_min_u32_e32 v12, v15, v14 
+; GFX6-NEXT: v_min_u32_e32 v13, v17, v16 +; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9 +; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12 +; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13 +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX6-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX6-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v8, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GFX6-NEXT: v_ldexp_f32_e32 v3, v2, v15 -; GFX6-NEXT: v_ldexp_f32_e32 v2, v0, v9 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v12 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v4, v13 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14 +; GFX6-NEXT: v_ldexp_f32_e32 v2, v0, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v4, v12 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -421,77 +383,68 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v10, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v10, s1 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_xor_b32_e32 v14, v3, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, v7, v8 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; GFX8-NEXT: v_xor_b32_e32 v12, v5, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v14 ; GFX8-NEXT: v_xor_b32_e32 v16, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v14, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v16 ; GFX8-NEXT: v_ffbh_i32_e32 v11, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 32, 33, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v11 ; GFX8-NEXT: v_ffbh_i32_e32 v13, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v13 ; GFX8-NEXT: v_ffbh_i32_e32 v15, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v15 ; GFX8-NEXT: v_ffbh_i32_e32 v17, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, -1, v0 -; GFX8-NEXT: v_sub_u32_e32 v15, vcc, 33, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v11 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v14, v[7:8] -; 
GFX8-NEXT: v_add_u32_e32 v16, vcc, -1, v12 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v13 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6] -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 33, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 33, v12 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, 33, v13 -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v16, v[3:4] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16 +; GFX8-NEXT: v_min_u32_e32 v0, v11, v0 +; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 +; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 +; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v2 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v5 -; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v15 +; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 +; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 -; GFX8-NEXT: v_ldexp_f32 v0, v2, v11 -; GFX8-NEXT: v_ldexp_f32 v2, v4, v13 +; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -511,36 +464,30 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_xor_b32 s8, s6, s7 -; GFX6-NEXT: s_flbit_i32 s10, s7 +; GFX6-NEXT: s_flbit_i32 s8, s7 +; GFX6-NEXT: s_xor_b32 s9, s6, s7 +; GFX6-NEXT: s_flbit_i32 s10, s5 ; GFX6-NEXT: s_xor_b32 s11, s4, s5 -; GFX6-NEXT: s_flbit_i32 s12, s5 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[8:9], s8, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v1, s10 -; GFX6-NEXT: v_cmp_gt_i32_e64 s[8:9], s11, -1 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 32, 33, s[8:9] -; GFX6-NEXT: v_mov_b32_e32 v3, s12 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s10, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, s12, -1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, -1, v0 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 33, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 33, v1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[6:7], v2 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[4:5], v3 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v4 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v5 +; GFX6-NEXT: s_add_i32 s8, s8, -1 +; GFX6-NEXT: s_ashr_i32 s9, s9, 31 +; GFX6-NEXT: s_add_i32 s10, s10, -1 +; GFX6-NEXT: s_ashr_i32 s11, s11, 31 +; GFX6-NEXT: s_add_i32 s9, s9, 32 +; GFX6-NEXT: s_add_i32 s11, s11, 32 +; GFX6-NEXT: s_min_u32 s8, s8, s9 +; GFX6-NEXT: s_min_u32 s9, s10, s11 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; GFX6-NEXT: s_sub_i32 s8, 32, s8 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX6-NEXT: s_sub_i32 s9, 32, s9 +; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_min_u32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -553,33 +500,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s2, s6, s7 -; GFX8-NEXT: s_cmp_gt_i32 s2, -1 -; GFX8-NEXT: s_flbit_i32 s3, s7 -; GFX8-NEXT: s_cselect_b32 s2, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s3, -1 -; GFX8-NEXT: s_cselect_b32 s2, s3, s2 -; GFX8-NEXT: s_add_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s8, 33, s2 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s3 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; GFX8-NEXT: s_xor_b32 s3, s6, s7 +; GFX8-NEXT: s_flbit_i32 s2, s7 +; GFX8-NEXT: s_ashr_i32 s3, s3, 31 +; GFX8-NEXT: s_add_i32 s2, s2, -1 +; GFX8-NEXT: s_add_i32 s3, s3, 32 +; GFX8-NEXT: s_min_u32 s9, s2, s3 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX8-NEXT: s_xor_b32 s2, s4, s5 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_cmp_gt_i32 s2, -1 -; GFX8-NEXT: s_flbit_i32 s3, s5 -; GFX8-NEXT: s_cselect_b32 s2, 33, 32 -; GFX8-NEXT: s_cmp_lg_u32 s3, -1 -; GFX8-NEXT: s_cselect_b32 s6, s3, s2 -; GFX8-NEXT: s_add_i32 s2, s6, -1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX8-NEXT: s_sub_i32 s2, 33, s6 -; GFX8-NEXT: v_ldexp_f32 v0, v0, s8 +; GFX8-NEXT: s_flbit_i32 s8, s5 +; GFX8-NEXT: s_ashr_i32 s2, s2, 31 +; GFX8-NEXT: s_add_i32 s8, s8, -1 +; GFX8-NEXT: s_add_i32 s2, s2, 32 +; GFX8-NEXT: s_min_u32 s7, s8, s2 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX8-NEXT: s_sub_i32 s6, 32, s9 +; GFX8-NEXT: 
s_sub_i32 s2, 32, s7 +; GFX8-NEXT: v_ldexp_f32 v0, v0, s6 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -618,62 +561,54 @@ ; GFX6-NEXT: v_ffbh_i32_e32 v15, v8 ; GFX6-NEXT: v_xor_b32_e32 v16, v5, v6 ; GFX6-NEXT: v_ffbh_i32_e32 v17, v6 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v12 -; GFX6-NEXT: v_cndmask_b32_e64 v12, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v14 -; GFX6-NEXT: v_cndmask_b32_e64 v14, 32, 33, vcc -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, -1, v16 -; GFX6-NEXT: v_cndmask_b32_e64 v16, 32, 33, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v9 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v13 -; GFX6-NEXT: v_cndmask_b32_e32 v9, v12, v13, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, -1, v17 -; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc -; GFX6-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 33, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 33, v9 -; GFX6-NEXT: v_add_i32_e32 v16, vcc, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 33, v12 -; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 33, v13 -; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v14 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v0 -; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v16 -; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v17 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, -1, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX6-NEXT: v_add_i32_e32 v13, vcc, -1, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, -1, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, -1, v17 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 32, v12 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, 32, v14 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, 32, v16 +; GFX6-NEXT: v_min_u32_e32 v0, v9, v0 +; GFX6-NEXT: v_min_u32_e32 v9, v13, v12 +; GFX6-NEXT: v_min_u32_e32 v12, v15, v14 +; GFX6-NEXT: v_min_u32_e32 v13, v17, v16 +; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, 32, v0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], v9 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 32, v9 +; GFX6-NEXT: v_lshl_b64 v[7:8], v[7:8], v12 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12 +; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13 +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX6-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX6-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v8, v3 -; GFX6-NEXT: v_or_b32_e32 v3, v6, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v8, v7 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 ; 
GFX6-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX6-NEXT: v_ldexp_f32_e32 v2, v2, v15 -; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v9 -; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v12 -; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v13 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX6-NEXT: v_ldexp_f32_e32 v3, v3, v14 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_ldexp_f32_e32 v1, v1, v9 +; GFX6-NEXT: v_ldexp_f32_e32 v2, v4, v12 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v3, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -681,83 +616,74 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v9, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_xor_b32_e32 v15, v3, v4 +; GFX8-NEXT: v_xor_b32_e32 v14, v3, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, v7, v8 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; GFX8-NEXT: v_xor_b32_e32 v13, v5, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v13 -; GFX8-NEXT: v_cndmask_b32_e64 v13, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v15 -; GFX8-NEXT: v_xor_b32_e32 v17, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v15, 32, 33, vcc -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, -1, v17 -; GFX8-NEXT: v_ffbh_i32_e32 v12, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 32, 33, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v12 -; GFX8-NEXT: v_ffbh_i32_e32 v14, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v14 -; GFX8-NEXT: v_ffbh_i32_e32 v16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v16 -; GFX8-NEXT: v_ffbh_i32_e32 v18, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v0 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 33, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v12 -; GFX8-NEXT: v_lshlrev_b64 v[7:8], v15, v[7:8] -; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v13 -; GFX8-NEXT: v_add_u32_e32 v18, vcc, -1, v14 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v0, v[5:6] -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 33, v12 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, 33, v13 -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 33, v14 -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v17, v[3:4] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v18, 
v[1:2] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v12, v5, v6 +; GFX8-NEXT: v_xor_b32_e32 v16, v1, v2 +; GFX8-NEXT: v_ffbh_i32_e32 v11, v8 +; GFX8-NEXT: v_ffbh_i32_e32 v13, v6 +; GFX8-NEXT: v_ffbh_i32_e32 v15, v4 +; GFX8-NEXT: v_ffbh_i32_e32 v17, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16 +; GFX8-NEXT: v_min_u32_e32 v0, v11, v0 +; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 +; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 +; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v2 -; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v5 +; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 +; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v16 -; GFX8-NEXT: v_ldexp_f32 v2, v2, v12 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v13 -; GFX8-NEXT: v_ldexp_f32 v0, v0, v14 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 +; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v10 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v11, v9, vcc +; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -16,10 +16,9 @@ ; GFX6-NEXT: 
s_flbit_i32_b32 s0, s3 ; GFX6-NEXT: s_min_u32 s8, s0, 32 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX6-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s1, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s0, 32, s8 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -31,13 +30,12 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 -; GFX8-NEXT: s_min_u32 s6, s4, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: s_sub_i32 s2, 32, s6 +; GFX8-NEXT: s_min_u32 s4, s4, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_sub_i32 s2, 32, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0 @@ -66,8 +64,7 @@ ; GFX6-NEXT: v_ffbh_u32_e32 v0, v4 ; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 @@ -80,28 +77,26 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_ffbh_u32_e32 v4, v2 -; GFX8-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v4, v[1:2] -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_ffbh_u32_e32 v3, v2 +; GFX8-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v3 +; GFX8-NEXT: v_min_u32_e32 v1, 1, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v4 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v4 -; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc -; GFX8-NEXT: flat_store_short v[0:1], v4 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid @@ -124,10 +119,9 @@ ; GFX6-NEXT: s_flbit_i32_b32 s0, s3 ; GFX6-NEXT: s_min_u32 s8, s0, 32 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[2:3], s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX6-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: s_min_u32 s0, s0, 1 +; GFX6-NEXT: s_or_b32 s0, s1, 
s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s0, 32, s8 ; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -138,14 +132,13 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s4, s3 -; GFX8-NEXT: s_min_u32 s6, s4, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v0 +; GFX8-NEXT: s_min_u32 s4, s4, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s6 +; GFX8-NEXT: s_sub_i32 s0, 32, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_ldexp_f32 v2, v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -172,8 +165,7 @@ ; GFX6-NEXT: v_ffbh_u32_e32 v0, v4 ; GFX6-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX6-NEXT: v_lshl_b64 v[3:4], v[3:4], v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 32, v0 @@ -185,26 +177,24 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_ffbh_u32_e32 v0, v2 -; GFX8-NEXT: v_min_u32_e32 v5, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[1:2] +; GFX8-NEXT: v_min_u32_e32 v4, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v5 -; GFX8-NEXT: v_ldexp_f32 v2, v6, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4 +; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -229,19 +219,17 @@ ; GFX6-NEXT: s_min_u32 s8, s8, 32 ; GFX6-NEXT: s_min_u32 s9, s9, 32 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GFX6-NEXT: s_sub_i32 s10, 32, s8 +; GFX6-NEXT: s_sub_i32 s8, 32, s8 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX6-NEXT: s_sub_i32 s11, 32, s9 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] -; GFX6-NEXT: v_or_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s10 -; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s11 +; GFX6-NEXT: s_sub_i32 s9, 32, 
s9 +; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_min_u32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v0, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v2, s9 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -251,22 +239,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_flbit_i32_b32 s6, s3 -; GFX8-NEXT: s_min_u32 s8, s6, 32 ; GFX8-NEXT: s_flbit_i32_b32 s7, s1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX8-NEXT: s_min_u32 s9, s7, 32 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], s0, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GFX8-NEXT: s_sub_i32 s0, 32, s8 +; GFX8-NEXT: s_min_u32 s6, s6, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_min_u32 s7, s7, 32 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_min_u32 s0, s0, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX8-NEXT: s_sub_i32 s0, 32, s6 ; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 -; GFX8-NEXT: s_sub_i32 s0, 32, s9 +; GFX8-NEXT: s_sub_i32 s0, 32, s7 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -310,14 +296,10 @@ ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX6-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX6-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7 @@ -337,19 +319,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v10, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v10, s1 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_ffbh_u32_e32 v12, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -360,33 +341,29 @@ ; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: 
v_lshlrev_b64 v[7:8], v0, v[7:8] -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] +; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 ; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 +; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 ; GFX8-NEXT: v_ldexp_f32 v2, v5, v2 ; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3] ; GFX8-NEXT: s_endpgm @@ -412,19 +389,17 @@ ; GFX6-NEXT: s_min_u32 s8, s8, 32 ; GFX6-NEXT: s_min_u32 s9, s9, 32 ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; GFX6-NEXT: s_sub_i32 s10, 32, s8 +; GFX6-NEXT: s_sub_i32 s8, 32, s8 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX6-NEXT: s_sub_i32 s11, 32, s9 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s6, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[8:9], s4, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] -; GFX6-NEXT: v_or_b32_e32 v0, s7, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s10 -; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s11 +; GFX6-NEXT: s_sub_i32 s9, 32, s9 +; GFX6-NEXT: s_min_u32 s6, s6, 1 +; GFX6-NEXT: s_min_u32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s7, s6 +; GFX6-NEXT: s_or_b32 s4, s5, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s8 +; GFX6-NEXT: v_ldexp_f32_e64 v1, v1, s9 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -442,18 +417,16 @@ ; GFX8-NEXT: s_min_u32 s8, s2, 32 ; GFX8-NEXT: s_min_u32 s9, s3, 32 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] -; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], s2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX8-NEXT: s_sub_i32 s8, 32, s8 +; GFX8-NEXT: s_min_u32 s2, s2, 1 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; 
GFX8-NEXT: s_min_u32 s2, s4, 1 +; GFX8-NEXT: s_or_b32 s2, s5, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: s_sub_i32 s6, 32, s8 ; GFX8-NEXT: s_sub_i32 s2, 32, s9 -; GFX8-NEXT: v_ldexp_f32 v0, v0, s8 +; GFX8-NEXT: v_ldexp_f32 v0, v0, s6 ; GFX8-NEXT: v_ldexp_f32 v1, v1, s2 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -500,14 +473,10 @@ ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 32, v12 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[5:6], v13 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, 32, v13 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX6-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX6-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX6-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v8, v7 @@ -535,43 +504,35 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v9, vcc +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v5 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc ; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6] -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v10, s1 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_ffbh_u32_e32 v13, v4 +; GFX8-NEXT: v_ffbh_u32_e32 v12, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_ffbh_u32_e32 v0, v8 -; GFX8-NEXT: v_ffbh_u32_e32 v12, v6 -; GFX8-NEXT: v_ffbh_u32_e32 v14, v2 +; GFX8-NEXT: v_ffbh_u32_e32 v11, v6 +; GFX8-NEXT: v_ffbh_u32_e32 v13, v2 ; GFX8-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: v_min_u32_e32 v14, 32, v14 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] -; GFX8-NEXT: v_sub_u32_e32 v15, vcc, 32, v0 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v13, v[3:4] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v14, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, 32, v13 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v14 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] +; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 +; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 +; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 +; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 
; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 @@ -580,16 +541,19 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v15 -; GFX8-NEXT: v_ldexp_f32 v4, v4, v12 -; GFX8-NEXT: v_ldexp_f32 v3, v3, v13 +; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 +; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v10 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v11, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]