diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1530,6 +1530,12 @@ return Code == SETEQ || Code == SETNE; } +/// Return true if this is a setcc instruction that performs an equality +/// comparison when used with floating point operands. +inline bool isFPEqualitySetCC(CondCode Code) { + return Code == SETOEQ || Code == SETONE || Code == SETUEQ || Code == SETUNE; +} + /// Return true if the specified condition returns true if the two operands to /// the condition are equal. Note that if one of the two operands is a NaN, /// this value is meaningless. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6066,10 +6066,15 @@ // (LHS0 < LHS1) | (RHS0 < RHS1) -> min(LHS0, RHS0) < LHS1 // and and-cmp-cmp will be replaced with max-cmp sequence: // (LHS0 < LHS1) & (RHS0 < RHS1) -> max(LHS0, RHS0) < LHS1 - if (OpVT.isInteger() && TLI.isOperationLegal(ISD::UMAX, OpVT) && - TLI.isOperationLegal(ISD::SMAX, OpVT) && - TLI.isOperationLegal(ISD::UMIN, OpVT) && - TLI.isOperationLegal(ISD::SMIN, OpVT)) { + if ((OpVT.isInteger() && TLI.isOperationLegal(ISD::UMAX, OpVT) && + TLI.isOperationLegal(ISD::SMAX, OpVT) && + TLI.isOperationLegal(ISD::UMIN, OpVT) && + TLI.isOperationLegal(ISD::SMIN, OpVT)) || + (OpVT.isFloatingPoint() && + ((TLI.isOperationLegalOrCustom(ISD::FMAXNUM, OpVT) && + TLI.isOperationLegalOrCustom(ISD::FMINNUM, OpVT)) || + (TLI.isOperationLegal(ISD::FMAXNUM_IEEE, OpVT) && + TLI.isOperationLegal(ISD::FMINNUM_IEEE, OpVT))))) { SDValue CommonValue; SDValue Operand1; SDValue Operand2; @@ -6080,7 +6085,9 @@ // predicate of one of the comparisons is the opposite of the other one. 
(CCL == CCR || CCL == ISD::getSetCCSwappedOperands(CCR)) && // The optimization does not work for `==` or `!=` . - !ISD::isIntEqualitySetCC(CCL) && !ISD::isIntEqualitySetCC(CCR)) { + !ISD::isIntEqualitySetCC(CCL) && !ISD::isFPEqualitySetCC(CCL) && + CCL != ISD::SETFALSE && CCL != ISD::SETO && CCL != ISD::SETUO && + CCL != ISD::SETTRUE) { if (CCL == CCR) { if (LHS0 == RHS0) { CommonValue = LHS0; @@ -6108,21 +6115,75 @@ } if (CC != ISD::SETCC_INVALID) { - unsigned NewOpcode; + std::optional NewOpcode; bool IsSigned = isSignedIntSetCC(CC); - if (((CC == ISD::SETLE || CC == ISD::SETULE || CC == ISD::SETLT || - CC == ISD::SETULT) && - (LogicOp->getOpcode() == ISD::OR)) || - ((CC == ISD::SETGE || CC == ISD::SETUGE || CC == ISD::SETGT || - CC == ISD::SETUGT) && - (LogicOp->getOpcode() == ISD::AND))) - NewOpcode = IsSigned ? ISD::SMIN : ISD::UMIN; - else - NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX; - - SDValue MinMaxValue = - DAG.getNode(NewOpcode, DL, OpVT, Operand1, Operand2); - return DAG.getSetCC(DL, VT, MinMaxValue, CommonValue, CC); + if (OpVT.isInteger()) { + if (((CC == ISD::SETLE || CC == ISD::SETULE || CC == ISD::SETLT || + CC == ISD::SETULT) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETGE || CC == ISD::SETUGE || CC == ISD::SETGT || + CC == ISD::SETUGT) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = IsSigned ? ISD::SMIN : ISD::UMIN; + else + NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX; + } else if (OpVT.isFloatingPoint()) { + // The optimization cannot be applied for all the predicates because + // of the way FMINNUM/FMAXNUM and FMINNUM_IEEE/FMAXNUM_IEEE treat + // NaNs. For FMINNUM_IEEE/FMAXNUM_IEEE, the optimization cannot be + // applied at all if one of the operands is a signaling NaN. 
+ bool isNotNaN = + DAG.isKnownNeverNaN(Operand1) && DAG.isKnownNeverNaN(Operand2); + bool isNotSNaN = + DAG.isKnownNeverSNaN(Operand1) && DAG.isKnownNeverSNaN(Operand2); + if (isNotNaN) { + // It is safe to use FMINNUM_IEEE/FMAXNUM_IEEE if all the operands + // are non-NaN values. + if (((CC == ISD::SETLT || CC == ISD::SETLE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETGT || CC == ISD::SETGE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMINNUM_IEEE; + else if (((CC == ISD::SETGT || CC == ISD::SETGE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETLT || CC == ISD::SETLE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMAXNUM_IEEE; + } else if (isNotSNaN) { + // Both FMINNUM/FMAXNUM and FMINNUM_IEEE/FMAXNUM_IEEE treat quiet + // NaNs in the same way. The optimization is applied for the + // following cases. + if (((CC == ISD::SETOLT || CC == ISD::SETOLE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETUGT || CC == ISD::SETUGE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMINNUM_IEEE; + else if (((CC == ISD::SETOGT || CC == ISD::SETOGE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETULT || CC == ISD::SETULE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMAXNUM_IEEE; + } else if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, OpVT) && + TLI.isOperationLegalOrCustom(ISD::FMINNUM, OpVT)) { + // If FMINNUM/FMAXNUM are supported, then we can handle signaling + // NaNs for the following cases. 
+ if (((CC == ISD::SETOLT || CC == ISD::SETOLE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETUGT || CC == ISD::SETUGE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMINNUM; + else if (((CC == ISD::SETOGT || CC == ISD::SETOGE) && + (LogicOp->getOpcode() == ISD::OR)) || + ((CC == ISD::SETULT || CC == ISD::SETULE) && + (LogicOp->getOpcode() == ISD::AND))) + NewOpcode = ISD::FMAXNUM; + } + } + if (NewOpcode) { + SDValue MinMaxValue = + DAG.getNode(*NewOpcode, DL, OpVT, Operand1, Operand2); + return DAG.getSetCC(DL, VT, MinMaxValue, CommonValue, CC); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -799,10 +799,9 @@ ; CHECK-LABEL: test54: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %arg3 %cmp2 = fcmp olt float %arg2, %arg3 @@ -814,10 +813,9 @@ ; CHECK-LABEL: test55: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ole double %arg1, %arg3 %cmp2 = fcmp ole double %arg2, %arg3 @@ -829,10 +827,9 @@ ; CHECK-LABEL: test56: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; 
CHECK-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ogt double %arg1, %arg3 %cmp2 = fcmp ogt double %arg2, %arg3 @@ -844,10 +841,9 @@ ; CHECK-LABEL: test57: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp oge float %arg1, %arg3 %cmp2 = fcmp oge float %arg2, %arg3 @@ -859,10 +855,9 @@ ; CHECK-LABEL: test58: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ugt double %arg1, %arg3 %cmp2 = fcmp ugt double %arg2, %arg3 @@ -874,10 +869,9 @@ ; CHECK-LABEL: test59: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp uge float %arg1, %arg3 %cmp2 = fcmp uge float %arg2, %arg3 @@ -889,10 +883,9 @@ ; CHECK-LABEL: test60: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; 
CHECK-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ule float %arg1, %arg3 %cmp2 = fcmp ule float %arg2, %arg3 @@ -904,10 +897,9 @@ ; CHECK-LABEL: test61: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ult double %arg1, %arg3 %cmp2 = fcmp ult double %arg2, %arg3 @@ -920,10 +912,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, 1.0 %add2 = fadd nnan float %arg2, 2.0 @@ -939,10 +930,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; CHECK-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan double %arg1, 1.0 %add2 = fadd nnan double %arg2, 2.0 @@ -958,10 +948,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; CHECK-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan double %arg1, 1.0 %add2 = fadd nnan double %arg2, 2.0 @@ -976,10 +965,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, 1.0 %add2 = fadd nnan float %arg2, 2.0 @@ -995,10 +983,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; CHECK-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan double %arg1, 1.0 %add2 = fadd nnan double %arg2, 2.0 @@ -1013,10 +1000,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, 1.0 %add2 = fadd nnan float %arg2, 2.0 @@ -1031,10 +1017,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_le_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, 1.0 %add2 = fadd nnan float %arg2, 2.0 @@ -1050,10 +1035,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; CHECK-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan double %arg1, 1.0 %add2 = fadd nnan double %arg2, 2.0 @@ -1068,10 +1052,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call float @llvm.canonicalize.f32(float %arg1) %var2 = call float @llvm.canonicalize.f32(float %arg2) @@ -1087,10 +1070,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: 
v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call double @llvm.canonicalize.f64(double %arg1) %var2 = call double @llvm.canonicalize.f64(double %arg2) @@ -1106,10 +1088,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call double @llvm.canonicalize.f64(double %arg1) %var2 = call double @llvm.canonicalize.f64(double %arg2) @@ -1124,10 +1105,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call float @llvm.canonicalize.f32(float %arg1) %var2 = call float @llvm.canonicalize.f32(float %arg2) @@ -1143,10 +1123,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call double @llvm.canonicalize.f64(double %arg1) %var2 = call double @llvm.canonicalize.f64(double %arg2) @@ -1161,10 +1140,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call float @llvm.canonicalize.f32(float %arg1) %var2 = call float @llvm.canonicalize.f32(float %arg2) @@ -1179,10 +1157,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call float @llvm.canonicalize.f32(float %arg1) %var2 = call float @llvm.canonicalize.f32(float %arg2) @@ -1198,10 +1175,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call double @llvm.canonicalize.f64(double %arg1) %var2 = call double @llvm.canonicalize.f64(double %arg2) @@ 
-1215,10 +1191,9 @@ ; CHECK-LABEL: test78: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v2, v1 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %arg3 %cmp2 = fcmp ogt float %arg3, %arg2 @@ -1230,10 +1205,9 @@ ; CHECK-LABEL: test79: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nle_f32_e64 s0, v2, v1 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ult float %arg1, %arg3 %cmp2 = fcmp ugt float %arg3, %arg2 @@ -1246,10 +1220,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_le_f32_e64 s0, v2, v1 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, 1.0 %add2 = fadd nnan float %arg2, 2.0 @@ -1265,10 +1238,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 ; CHECK-NEXT: v_add_f64 v[2:3], v[2:3], 2.0 +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_lt_f64_e64 s0, v[4:5], v[2:3] -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 
s[30:31] %add1 = fadd nnan double %arg1, 1.0 %add2 = fadd nnan double %arg2, 2.0 @@ -1284,10 +1256,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; CHECK-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_ge_f64_e64 s0, v[4:5], v[2:3] -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call double @llvm.canonicalize.f64(double %arg1) %var2 = call double @llvm.canonicalize.f64(double %arg2) @@ -1302,10 +1273,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nlt_f32_e64 s0, v2, v1 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call float @llvm.canonicalize.f32(float %arg1) %var2 = call float @llvm.canonicalize.f32(float %arg2) @@ -1321,10 +1291,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f16_e32 v0, v0, v0 ; CHECK-NEXT: v_max_f16_e32 v1, v1, v1 +; CHECK-NEXT: v_min_f16_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_lt_f16_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call half @llvm.canonicalize.f16(half %arg1) %var2 = call half @llvm.canonicalize.f16(half %arg2) @@ -1340,17 +1309,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_pk_max_f16 v0, v0, v0 ; CHECK-NEXT: v_pk_max_f16 v1, v1, v1 -; CHECK-NEXT: 
v_lshrrev_b32_e32 v3, 16, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; CHECK-NEXT: v_pk_min_f16 v0, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; CHECK-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_le_f16_e64 s0, v1, v2 -; CHECK-NEXT: v_cmp_le_f16_e64 s1, v4, v3 -; CHECK-NEXT: v_cmp_le_f16_e64 s2, v5, v3 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; CHECK-NEXT: s_or_b32 s0, s1, s2 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1) %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2) @@ -1366,17 +1331,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_pk_max_f16 v0, v0, v0 ; CHECK-NEXT: v_pk_max_f16 v1, v1, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; CHECK-NEXT: v_pk_max_f16 v0, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; CHECK-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_gt_f16_e64 s0, v1, v2 -; CHECK-NEXT: v_cmp_gt_f16_e64 s1, v4, v3 -; CHECK-NEXT: v_cmp_gt_f16_e64 s2, v5, v3 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; CHECK-NEXT: s_or_b32 s0, s1, s2 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1) %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2) @@ -1392,10 +1353,9 @@ ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f16_e32 v0, v0, v0 ; CHECK-NEXT: v_max_f16_e32 v1, v1, v1 +; CHECK-NEXT: v_max_f16_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ge_f16_e64 s0, v1, v2 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call half @llvm.canonicalize.f16(half %arg1) %var2 = call half @llvm.canonicalize.f16(half %arg2) @@ -1411,17 +1371,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_pk_max_f16 v0, v0, v0 ; CHECK-NEXT: v_pk_max_f16 v1, v1, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; CHECK-NEXT: v_pk_min_f16 v0, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; CHECK-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nle_f16_e64 s0, v1, v2 -; CHECK-NEXT: v_cmp_nle_f16_e64 s1, v4, v3 -; CHECK-NEXT: v_cmp_nle_f16_e64 s2, v5, v3 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; CHECK-NEXT: s_and_b32 s0, s1, s2 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1) %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2) @@ -1437,10 +1393,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f16_e32 v0, v0, v0 ; CHECK-NEXT: v_max_f16_e32 v1, v1, v1 +; CHECK-NEXT: v_min_f16_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nlt_f16_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call half @llvm.canonicalize.f16(half %arg1) %var2 = call half @llvm.canonicalize.f16(half %arg2) @@ -1456,10 +1411,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_max_f16_e32 v0, v0, v0 ; CHECK-NEXT: v_max_f16_e32 v1, v1, v1 +; CHECK-NEXT: v_max_f16_e32 v0, v0, v1 ; CHECK-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_ngt_f16_e64 s0, v1, v2 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call half @llvm.canonicalize.f16(half %arg1) %var2 = call half @llvm.canonicalize.f16(half %arg2) @@ -1475,17 +1429,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_pk_max_f16 v0, v0, v0 ; CHECK-NEXT: v_pk_max_f16 v1, v1, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; CHECK-NEXT: v_pk_max_f16 v0, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; CHECK-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v2 -; CHECK-NEXT: v_cmp_nge_f16_e64 s0, v1, v2 -; CHECK-NEXT: v_cmp_nge_f16_e64 s1, v4, v3 -; CHECK-NEXT: v_cmp_nge_f16_e64 s2, v5, v3 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; CHECK-NEXT: s_and_b32 s0, s1, s2 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1) %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2) @@ -1869,12 +1819,9 @@ ; CHECK-LABEL: test107: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min3_f32 v0, v0, v1, v2 ; CHECK-NEXT: 
v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v3 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v3 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C %cmp2 = fcmp olt float %arg2, %C @@ -1888,12 +1835,9 @@ ; CHECK-LABEL: test108: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max3_f32 v0, v0, v1, v2 ; CHECK-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v3 -; CHECK-NEXT: v_cmp_nge_f32_e64 s0, v1, v3 -; CHECK-NEXT: v_cmp_nge_f32_e64 s1, v2, v3 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ult float %arg1, %C %cmp2 = fcmp ult float %arg2, %C @@ -1907,13 +1851,12 @@ ; CHECK-LABEL: test109: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; CHECK-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; CHECK-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v4 -; CHECK-NEXT: v_cmp_gt_f32_e64 s1, v2, v4 -; CHECK-NEXT: v_cmp_gt_f32_e64 s2, v3, v4 +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v1, v4 ; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C @@ -1932,13 +1875,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; CHECK-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 +; CHECK-NEXT: v_dual_max_f32 v0, v0, v1 :: 
v_dual_min_f32 v1, v2, v3 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_gt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_gt_f32_e64 s2, v3, v8 +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v1, v8 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s1, s1, s2 -; CHECK-NEXT: s_and_b32 s0, s0, s1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 @@ -1959,20 +1899,13 @@ ; CHECK-LABEL: test111: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; CHECK-NEXT: v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4 +; CHECK-NEXT: v_min3_f32 v0, v0, v1, v2 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v3 +; CHECK-NEXT: v_min3_f32 v0, v5, v6, v0 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_or_b32 s3, vcc_lo, s0 -; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v8 -; CHECK-NEXT: s_or_b32 s2, s1, s2 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v5, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v6, v8 -; CHECK-NEXT: s_or_b32 s2, s3, s2 -; CHECK-NEXT: s_or_b32 s2, s2, vcc_lo -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_or_b32 s0, s0, s2 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C %cmp2 = fcmp olt float %arg2, %C @@ -1996,19 +1929,14 @@ ; CHECK-LABEL: test112: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_or_b32 s3, vcc_lo, s0 +; CHECK-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 ; CHECK-NEXT: 
v_cmp_nge_f32_e32 vcc_lo, v4, v8 -; CHECK-NEXT: s_or_b32 s2, s1, s2 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v5, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v6, v8 -; CHECK-NEXT: s_or_b32 s2, s3, s2 -; CHECK-NEXT: s_or_b32 s2, s2, vcc_lo -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_or_b32 s0, s0, s2 +; CHECK-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3 +; CHECK-NEXT: v_max_f32_e32 v3, v6, v6 +; CHECK-NEXT: v_min3_f32 v0, v0, v1, v2 +; CHECK-NEXT: v_min3_f32 v0, v0, v5, v3 +; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v0, v8 +; CHECK-NEXT: s_or_b32 s0, s0, vcc_lo ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C @@ -2033,11 +1961,11 @@ ; CHECK-LABEL: test113: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v3 -; CHECK-NEXT: v_cmp_nge_f32_e64 s0, v1, v3 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v3 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_cmp_nge_f32_e64 s0, v0, v3 +; CHECK-NEXT: s_or_b32 s0, s0, vcc_lo ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ult float %arg1, %C @@ -2052,11 +1980,11 @@ ; CHECK-LABEL: test114: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v1, v3 -; CHECK-NEXT: v_cmp_nge_f32_e64 s1, v2, v3 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; CHECK-NEXT: v_cmp_nge_f32_e32 vcc_lo, v2, v3 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v3 +; CHECK-NEXT: s_and_b32 s0, s0, vcc_lo ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: 
s_setpc_b64 s[30:31] %cmp1 = fcmp ogt float %arg1, %C @@ -2071,14 +1999,12 @@ ; CHECK-LABEL: test115: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; CHECK-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 +; CHECK-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3 +; CHECK-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v4 -; CHECK-NEXT: v_cmp_nge_f32_e64 s1, v2, v4 -; CHECK-NEXT: v_cmp_nge_f32_e64 s2, v3, v4 +; CHECK-NEXT: v_cmp_nge_f32_e64 s0, v1, v4 ; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C @@ -2097,25 +2023,22 @@ ; CHECK-LABEL: test116: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v10 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v10 -; CHECK-NEXT: v_cmp_gt_f32_e64 s1, v2, v10 -; CHECK-NEXT: v_cmp_gt_f32_e64 s2, v3, v10 -; CHECK-NEXT: v_cmp_lt_f32_e64 s3, v4, v10 -; CHECK-NEXT: v_cmp_lt_f32_e64 s4, v5, v10 -; CHECK-NEXT: v_cmp_gt_f32_e64 s5, v6, v10 -; CHECK-NEXT: v_cmp_gt_f32_e64 s6, v7, v10 -; CHECK-NEXT: v_cmp_lt_f32_e64 s7, v8, v10 -; CHECK-NEXT: v_cmp_lt_f32_e64 s8, v9, v10 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s2, s3, s4 -; CHECK-NEXT: s_or_b32 s3, s5, s6 -; CHECK-NEXT: s_or_b32 s4, s7, s8 +; CHECK-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 +; CHECK-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; CHECK-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; CHECK-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v4, v4, v4 +; CHECK-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v6, v6, v6 +; 
CHECK-NEXT: v_min_f32_e32 v8, v8, v9 +; CHECK-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5 +; CHECK-NEXT: v_max_f32_e32 v4, v6, v7 +; CHECK-NEXT: v_min3_f32 v0, v0, v1, v8 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v10 +; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v3, v10 +; CHECK-NEXT: v_cmp_gt_f32_e64 s1, v4, v10 +; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v0, v10 +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_or_b32 s1, s2, vcc_lo ; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_or_b32 s1, s2, s3 -; CHECK-NEXT: s_or_b32 s0, s4, s0 -; CHECK-NEXT: s_or_b32 s0, s1, s0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C @@ -2144,27 +2067,20 @@ ; CHECK-LABEL: test117: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v6, v6, v6 +; CHECK-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v10, v10, v10 +; CHECK-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; CHECK-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v2, v2, v2 +; CHECK-NEXT: v_min_f32_e32 v6, v6, v7 +; CHECK-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11 +; CHECK-NEXT: v_min_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_min3_f32 v3, v4, v5, v6 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v12 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v12 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s3, v4, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s4, v5, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s5, v6, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s6, v7, v13 -; CHECK-NEXT: v_cmp_lt_f32_e64 s7, v8, v12 -; CHECK-NEXT: v_cmp_lt_f32_e64 s8, v9, v12 -; CHECK-NEXT: v_cmp_lt_f32_e64 s9, v10, v12 -; CHECK-NEXT: v_cmp_lt_f32_e64 s10, v11, v12 +; CHECK-NEXT: v_min3_f32 v0, v8, v9, v1 +; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v2, v13 +; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v3, v13 +; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v0, v12 
; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s2, s3, s4 -; CHECK-NEXT: s_or_b32 s3, s5, s6 -; CHECK-NEXT: s_or_b32 s4, s7, s8 -; CHECK-NEXT: s_or_b32 s5, s9, s10 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: s_or_b32 s1, s2, s3 -; CHECK-NEXT: s_or_b32 s2, s4, s5 ; CHECK-NEXT: s_or_b32 s0, s0, s1 ; CHECK-NEXT: s_or_b32 s0, s2, s0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -2202,14 +2118,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; CHECK-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 +; CHECK-NEXT: v_min_f32_e32 v0, v0, v1 +; CHECK-NEXT: v_max3_f32 v0, v0, v2, v3 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s1, s1, s2 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 %add2 = fadd nnan float %arg2, %C2 @@ -2229,16 +2141,12 @@ ; CHECK-LABEL: test119: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; CHECK-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 +; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 +; CHECK-NEXT: v_min_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_minmax_f32 v0, v0, v1, v2 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_or_b32 s1, s1, s2 -; CHECK-NEXT: s_and_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 
1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 %add2 = fadd nnan float %arg2, %C2 @@ -2258,16 +2166,12 @@ ; CHECK-LABEL: test120: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; CHECK-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 +; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 +; CHECK-NEXT: v_max_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_min3_f32 v0, v0, v1, v2 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 %add2 = fadd nnan float %arg2, %C2 @@ -2287,16 +2191,12 @@ ; CHECK-LABEL: test121: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; CHECK-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 +; CHECK-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 +; CHECK-NEXT: v_max_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_maxmin_f32 v0, v0, v1, v2 ; CHECK-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s0, v1, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s1, v2, v8 -; CHECK-NEXT: v_cmp_lt_f32_e64 s2, v3, v8 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_and_b32 s1, s1, s2 -; CHECK-NEXT: s_or_b32 s0, s0, s1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 %add2 = fadd nnan float %arg2, %C2 diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll 
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -92,18 +92,30 @@ ; Regression test for a crash caused by D139469. define i32 @test_D139469_f16(half %arg) { -; GFX9-LABEL: test_D139469_f16: -; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_f16_e32 v1, 0x291e, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x291e -; GFX9-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX9-NEXT: v_fma_f16 v0, v0, s4, v1 -; GFX9-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_D139469_f16: +; GFX9-SDAG: ; %bb.0: ; %bb +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e +; GFX9-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX9-SDAG-NEXT: v_fma_f16 v0, v0, s4, v2 +; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: test_D139469_f16: +; GFX9-GISEL: ; %bb.0: ; %bb +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mul_f16_e32 v1, 0x291e, v0 +; GFX9-GISEL-NEXT: s_movk_i32 s4, 0x291e +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX9-GISEL-NEXT: v_fma_f16 v0, v0, s4, v1 +; GFX9-GISEL-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 +; GFX9-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_D139469_f16: ; GFX10-SDAG: ; %bb.0: ; %bb @@ -111,10 +123,9 @@ ; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e ; GFX10-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 ; GFX10-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, 
v2 -; GFX10-SDAG-NEXT: v_cmp_gt_f16_e64 s4, 0, v1 -; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v2, v1 +; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_D139469_f16: @@ -143,17 +154,14 @@ ; GFX9-SDAG: ; %bb.0: ; %bb ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x291e +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x211e ; GFX9-SDAG-NEXT: v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v0, s4, v2 op_sel_hi:[1,0,0] +; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-SDAG-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1 -; GFX9-SDAG-NEXT: v_cmp_lt_f16_sdwa s[6:7], v1, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v0, s4, v1 op_sel_hi:[1,0,0] -; GFX9-SDAG-NEXT: v_cmp_gt_f16_e64 s[4:5], 0, v0 -; GFX9-SDAG-NEXT: v_cmp_lt_f16_sdwa s[8:9], v0, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] +; GFX9-SDAG-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -182,13 +190,10 @@ ; GFX10-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] ; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s4 op_sel_hi:[0,1,0] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 -; GFX10-SDAG-NEXT: v_cmp_gt_f16_e64 s4, 0, v0 -; GFX10-SDAG-NEXT: v_cmp_lt_f16_sdwa s5, v1, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX10-SDAG-NEXT: v_cmp_lt_f16_sdwa s6, v0, v2 
src0_sel:WORD_1 src1_sel:DWORD -; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10-SDAG-NEXT: s_or_b32 s4, s5, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1160,77 +1160,74 @@ ; GFX6-LABEL: or_i1: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: s_mov_b32 s2, s10 -; GFX6-NEXT: s_mov_b32 s3, s11 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s10, s2 +; GFX6-NEXT: s_mov_b32 s11, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s12, s6 ; GFX6-NEXT: s_mov_b32 s13, s7 -; GFX6-NEXT: s_mov_b32 s14, s10 -; GFX6-NEXT: s_mov_b32 s15, s11 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s14, s2 +; GFX6-NEXT: s_mov_b32 s15, s3 +; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX6-NEXT: s_mov_b32 s8, s4 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(1) -; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX6-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: or_i1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s11, 0xf000 -; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: s_mov_b32 s2, s10 -; GFX8-NEXT: s_mov_b32 s3, s11 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: s_mov_b32 s14, s10 -; GFX8-NEXT: s_mov_b32 s15, s11 -; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_mov_b32 s14, s2 +; GFX8-NEXT: s_mov_b32 s15, s3 +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; GFX8-NEXT: s_mov_b32 s8, s4 -; GFX8-NEXT: s_mov_b32 s9, s5 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: or_i1: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @8 -; EG-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @10 -; EG-NEXT: ALU 5, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.X, T1.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: Fetch clause starting at 6: ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].W, ; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T0.X, KC0[2].W, -; EG-NEXT: ALU clause starting at 13: -; EG-NEXT: MOV * T1.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: SETGE_DX10 T0.W, T0.X, 0.0, -; EG-NEXT: SETGE_DX10 * T1.W, T1.X, 0.0, -; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, +; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, ; EG-NEXT: AND_INT T0.X, PV.W, 1, ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)