Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -97,12 +97,16 @@ SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; + + SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const; bool isConstantCostlierToNegate(SDValue N) const; + bool isConstantCheaperToNegate(SDValue N) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -567,6 +567,7 @@ case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::SELECT: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -592,7 +593,8 @@ /// modifiers. 
LLVM_READONLY static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { - return N->getNumOperands() > 2 || VT == MVT::f64; + return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) || + VT == MVT::f64; } // Most FP instructions support source modifiers, but this could be refined @@ -604,7 +606,6 @@ switch (N->getOpcode()) { case ISD::CopyToReg: - case ISD::SELECT: case ISD::FDIV: case ISD::FREM: case ISD::INLINEASM: @@ -629,6 +630,9 @@ return true; } } + case ISD::SELECT: + // TODO: Only applies if select will be vector + return N->getValueType(0) == MVT::f32; default: return true; } @@ -644,6 +648,8 @@ unsigned NumMayIncreaseSize = 0; MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); + assert(!N->use_empty()); + // XXX - Should this limit number of uses to check? for (const SDNode *U : N->uses()) { if (!hasSourceMods(U)) @@ -3733,8 +3739,9 @@ // // select c, (fabs x), (fabs y) -> fabs (select c, x, y) // select c, (fabs x), +k -> fabs (select c, x, k) -static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, - SDValue N) { +SDValue +AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) const { SelectionDAG &DAG = DCI.DAG; SDValue Cond = N.getOperand(0); SDValue LHS = N.getOperand(1); @@ -3743,6 +3750,9 @@ EVT VT = N.getValueType(); if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, RHS); } @@ -3774,10 +3784,24 @@ } if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) + return SDValue(); + + // We're going to be forced to use a source modifier anyway, there's no + // point to pulling the negate out unless we can get a size reduction by + // negating the constant. 
+ // + // TODO: Generalize to use getCheaperNegatedExpression which doesn't know + // about cheaper constants. + if (NewLHS.getOpcode() == ISD::FABS && + getConstantNegateCost(CRHS) != NegatibleCost::Cheaper) + return SDValue(); + + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + if (LHS.getOpcode() == ISD::FNEG) NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - else if (CRHS->isNegative()) - return SDValue(); if (Inv) std::swap(NewLHS, NewRHS); @@ -3792,7 +3816,6 @@ return SDValue(); } - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) @@ -3868,6 +3891,12 @@ return false; } +bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) + return getConstantNegateCost(C) == NegatibleCost::Cheaper; + return false; +} + static unsigned inverseMinMax(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -4103,6 +4132,11 @@ DAG.getConstant(0x8000, SL, SrcVT)); return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); } + case ISD::SELECT: { + // fneg (select c, a, b) -> select c, (fneg a), (fneg b) + // TODO: Invert conditions of foldFreeOpFromSelect + return SDValue(); + } default: return SDValue(); } Index: llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,19 +1,16 @@ ; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s ; RUN: llc 
-march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 - -; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -24,13 +21,12 @@ ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -44,8 +40,7 @@ ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 define amdgpu_ps float 
@max_fneg_select_regression_0(float %a) #0 { @@ -61,8 +56,7 @@ ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { @@ -73,13 +67,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -88,13 +81,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -1.0 @@ -103,13 +95,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_min_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, 
-v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -1.0 @@ -118,13 +109,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NANN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -1.0 @@ -133,17 +123,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -8.0 @@ -152,17 +141,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, 
[[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -171,17 +159,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -8.0 @@ -190,18 +177,17 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 
0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -210,13 +196,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, -1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, 1.0 @@ -225,15 +210,14 @@ } ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nge_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; GCN-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - +; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -242,14 +226,14 @@ } ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nle_f32_e32 vcc, v0, 
v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nle_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; SI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; SI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -226,6 +226,10 @@ ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]], ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]] ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]] + +; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v +; GCN-NSZ: v_cmp_ngt_f32 +; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 @@ -246,9 +250,12 @@ ; GCN-LABEL: {{^}}fneg_fadd_0_nsz: ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]], -; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], -; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], -; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]] +; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000 +; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]] +; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]] +; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}}, +; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0 +; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0, define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { .entry: %tmp7 = fdiv afn float 
1.000000e+00, %tmp6 @@ -2731,6 +2738,79 @@ ret <2 x half> %add } +; FIXME: This fneg should fold into select +; GCN-LABEL: {{^}}v_fneg_select_f32: +; GCN: s_waitcnt +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %select = select i1 %cond, float %a, float %b + %fneg = fneg float %select + ret float %fneg +} + +; FIXME: This fneg should fold into select +; GCN-LABEL: {{^}}v_fneg_select_2_f32: +; GCN: s_waitcnt +; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1 +; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2 +; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc +; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1 +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2 +; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc +; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 + +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %add.0 = fadd float %a, 2.0 + %add.1 = fadd float %b, 4.0 + %select = select i1 %cond, float %add.0, float %add.1 + %neg.select = fneg float %select + ret float %neg.select +} + +; GCN-LABEL: {{^}}v_fneg_posk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr 
addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float 4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_negk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float -4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) Index: llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -203,9 +203,9 @@ ; ; SI-NSZ-LABEL: fneg_fadd_0_f32: ; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 ; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, -1.0, s1, -1.0 +; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 ; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 @@ -215,11 +215,11 @@ ; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 ; SI-NSZ-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 +; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 ; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc @@ -251,8 +251,8 @@ ; ; VI-NSZ-LABEL: fneg_fadd_0_f32: ; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, -1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, -1.0, s1, -1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 ; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 @@ -265,10 +265,10 @@ ; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, -1.0 +; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 ; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; VI-NSZ-NEXT: ; return to shader part epilog @@ -302,9 +302,9 @@ ; GCN-NSZ: ; %bb.0: ; %.entry ; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 
+; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc @@ -2773,14 +2773,12 @@ ; SI-LABEL: s_fneg_select_infloop_regression_f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -2789,14 +2787,12 @@ ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2815,9 +2811,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, 
float %arg %i2 = fneg float %i @@ -2832,9 +2826,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 0.0 %i2 = fneg float %i @@ -2849,9 +2841,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = fneg float %i @@ -2866,9 +2856,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 0.0 %i2 = fneg float %i @@ -2884,8 +2872,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 2.0, float %arg %i2 = fneg float %i @@ -2900,8 +2887,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, 2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 
%arg1, float %arg, float 2.0 %i2 = fneg float %i @@ -2916,8 +2902,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 2.0, float %arg %i2 = fneg float %i @@ -2932,8 +2917,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float 2.0 %i2 = fneg float %i @@ -2949,8 +2933,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float -2.0, float %arg %i2 = fneg float %i @@ -2965,8 +2948,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 2.0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, -2.0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float -2.0 %i2 = fneg float %i @@ -2981,8 +2963,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -2.0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float -2.0, float %arg %i2 = fneg float %i @@ -2997,8 
+2978,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, -2.0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 2.0, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -2.0, -v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float %arg, float -2.0 %i2 = fneg float %i @@ -3013,14 +2993,16 @@ ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bitcmp1_b32 s4, 0 +; SI-NEXT: s_and_b32 s4, 1, s4 +; SI-NEXT: s_cselect_b32 s3, 0, s3 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; SI-NEXT: s_cmp_eq_u32 s4, 1 ; SI-NEXT: s_cselect_b32 s3, 0, s3 ; SI-NEXT: s_cselect_b32 s2, 0, s2 -; SI-NEXT: s_cselect_b32 s3, 0x80000000, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_xor_b32 s2, s3, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; SI-NEXT: s_endpgm @@ -3031,14 +3013,16 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s4, 0 +; VI-NEXT: s_and_b32 s4, 1, s4 +; VI-NEXT: s_cselect_b32 s3, 0, s3 +; VI-NEXT: s_cselect_b32 s2, 0, s2 +; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 +; VI-NEXT: s_cmp_eq_u32 s4, 1 ; VI-NEXT: s_cselect_b32 s3, 0, s3 ; VI-NEXT: s_cselect_b32 s2, 0, s2 -; VI-NEXT: s_cselect_b32 s3, 0x80000000, s3 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_xor_b32 s2, s3, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -3056,10 +3040,10 @@ ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v2, 1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, double 0.0, double %arg %i2 = fneg double %i @@ -3072,14 +3056,13 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_bfrev_b32_e32 v1, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_bitcmp1_b32 s2, 16 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_cvt_f16_f32_e64 v2, -v0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_short v[0:1], v2 @@ -3089,14 +3072,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v0, 0x8000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3] +; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 @@ -3115,11 +3097,9 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v1, 1, v1 ; SI-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v1 -; SI-NEXT: v_bfrev_b32_e32 v1, 1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_select_infloop_regression_f16: @@ -3128,9 +3108,8 @@ ; VI-NEXT: v_and_b32_e32 v1, 1, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v1, 0x8000 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, half 0.0, half %arg %i2 = fneg half %i @@ -3287,7 +3266,7 @@ ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] -; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0 +; SI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -3301,7 +3280,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] -; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0 +; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3320,7 +3299,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = call float @llvm.fabs.f32(float %i) @@ -3332,14 +3311,12 @@ ; SI-LABEL: s_fneg_fabs_select_infloop_regression: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_bfrev_b32_e32 v0, 1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_bitcmp1_b32 s1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1] -; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: flat_store_dword v[0:1], v2 @@ -3348,14 +3325,12 @@ ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_bfrev_b32_e32 v0, 1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, |v1|, v0, s[0:1] -; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3375,9 +3350,7 @@ ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 -; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -|v0|, 0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %i = select i1 %arg1, float 0.0, float %arg %i2 = call float @llvm.fabs.f32(float %i) Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -7,10 +7,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: 
v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_legacy_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -25,10 +24,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %cmp = icmp eq i32 %c, 0 Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -993,20 +993,20 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_posk_f16: ; VI: ; %bb.0: ; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0x4400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -v0, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1023,20 +1023,20 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -v0, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_posk_negfabs_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0x4400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -v0, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1053,19 +1053,20 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negfabs_negk_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0xc400 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) @@ -1082,19 +1083,20 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1| ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc -; CI-NEXT: v_mul_f32_e64 v0, -|v0|, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; CI-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: mul_select_negk_negfabs_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; VI-NEXT: v_mov_b32_e32 v3, 0xc400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; VI-NEXT: v_mul_f16_e64 v0, -|v0|, v2 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll @@ -6,11 +6,10 @@ ; VI-LABEL: select_fneg_posk_src_rcp_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v1, v1 -; VI-NEXT: v_mov_b32_e32 v2, 0xc000 +; VI-NEXT: v_rcp_f16_e64 v1, -v1 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 
s[30:31] %cmp = icmp eq i32 %c, 0 %rcp = call half @llvm.amdgcn.rcp.f16(half %x) Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -372,8 +372,8 @@ ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] ; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { @@ -639,8 +639,8 @@ ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -659,8 +659,8 @@ ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 4.0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -677,8 +677,8 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; 
GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -696,9 +696,9 @@ ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN: s_cmp_lg_u32 -; GCN: s_cselect_b64 vcc, -1, 0 -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN: s_cselect_b64 s[0:1], -1, 0 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, -|[[X]]|, s[0:1] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -804,10 +804,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -824,10 +823,14 @@ ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 -; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0 + +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]] +; SI: v_mul_f32_e32 
v{{[0-9]+}}, [[SELECT]], [[Y]] + +; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]] +; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_inv2pi_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -845,10 +848,16 @@ ; GCN: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 -; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] + +; GCN-DAG: s_cselect_b64 [[VCC:.*]], -1, 0 + +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; SI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, [[VCC]] +; SI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] + + +; VI: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0.15915494, -|[[X]]|, [[VCC]] +; VI: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_inv2pi_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -864,12 +873,9 @@ ; GCN-LABEL: {{^}}mul_select_negfabs_negk_inv2pi_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 -; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc - -; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc - -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_negk_inv2pi_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -886,13 +892,11 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword 
[[Y:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: s_cmp_lg_u32 -; GCN: s_cselect_b64 vcc, -1, 0 -; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc - -; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] +; GCN: s_cselect_b64 s[0:1], -1, 0 +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], -|[[X]]|, s[0:1] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negk_inv2pi_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -908,11 +912,10 @@ ; GCN-LABEL: {{^}}mul_select_negfabs_posk_0_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_negfabs_posk_0_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -930,11 +933,10 @@ ; GCN: buffer_load_dword [[X:v[0-9]+]] ; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN-DAG: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b64 [[VCC:.*]], -1, 0 -; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] +; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 0, -|[[X]]|, [[VCC]] +; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] define amdgpu_kernel void @mul_select_posk_0_negfabs_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, 
ptr addrspace(1) undef