Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -567,6 +567,7 @@ case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::SELECT: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -592,7 +593,8 @@ /// modifiers. LLVM_READONLY static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { - return N->getNumOperands() > 2 || VT == MVT::f64; + return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) || + VT == MVT::f64; } // Most FP instructions support source modifiers, but this could be refined @@ -604,7 +606,6 @@ switch (N->getOpcode()) { case ISD::CopyToReg: - case ISD::SELECT: case ISD::FDIV: case ISD::FREM: case ISD::INLINEASM: @@ -629,6 +630,9 @@ return true; } } + case ISD::SELECT: + // TODO: Only applies if select will be vector + return N->getValueType(0) == MVT::f32; default: return true; } @@ -3723,6 +3727,9 @@ EVT VT = N.getValueType(); if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, RHS); } @@ -3759,6 +3766,9 @@ else if (CRHS->isNegative()) return SDValue(); + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + if (Inv) std::swap(NewLHS, NewRHS); @@ -4078,6 +4088,22 @@ DAG.getConstant(0x8000, SL, SrcVT)); return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); } + case ISD::SELECT: { + SDLoc SL(N); + SDValue Cond = N0.getOperand(0); + SDValue LHS = N0.getOperand(1); + SDValue RHS = N0.getOperand(2); + + if (isa<ConstantFPSDNode>(RHS)) + return SDValue(); + + SDValue Res = DAG.getNode(ISD::SELECT, SL, VT, Cond, + DAG.getNode(ISD::FNEG, SL, VT, LHS), + DAG.getNode(ISD::FNEG, 
SL, VT, RHS)); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } default: return SDValue(); } Index: llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,19 +1,16 @@ ; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NNAN %s +; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 - -; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -24,13 +21,12 @@ ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul -; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: 
v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -44,8 +40,7 @@ ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { @@ -61,8 +56,7 @@ ; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { @@ -73,13 +67,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -88,13 +81,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, 1.0, -v0 +; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 ; VI-SAFE: 
v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -1.0 @@ -103,13 +95,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1: -; SI-SAFE: v_min_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_min_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -1.0 @@ -118,13 +109,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, 1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, 1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc -; GCN-NANN: v_max_f32_e64 v0, -v0, 1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0 define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -1.0 @@ -133,17 +123,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 
0x41000000 ; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -8.0 @@ -152,17 +141,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -171,17 +159,16 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; 
VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -8.0 @@ -190,18 +177,17 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: -; SI-SAFE: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI-SAFE-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] +; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] ; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc1000000 +; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, [[K1]], v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc -; GCN-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; GCN-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -210,13 +196,12 @@ } ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: -; SI-SAFE: v_max_legacy_f32_e64 v0, -v0, -1.0 +; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 ; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc -; GCN-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg 
float %a %cmp.a = fcmp olt float %a, 1.0 @@ -225,15 +210,14 @@ } ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nge_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; GCN-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - +; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -242,14 +226,14 @@ } ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: -; SI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; SI: v_cmp_nle_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc ; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; SI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; SI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -226,6 +226,10 @@ ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]], ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]] ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]] + +; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v +; GCN-NSZ: v_cmp_ngt_f32 +; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { .entry: %tmp7 = fdiv float 
1.000000e+00, %tmp6 @@ -246,9 +250,12 @@ ; GCN-LABEL: {{^}}fneg_fadd_0_nsz: ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]], ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]], -; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], -; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], -; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]] +; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000 +; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]] +; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]] +; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}}, +; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0 +; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0, define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 @@ -2731,6 +2738,72 @@ ret <2 x half> %add } +; GCN-LABEL: {{^}}v_fneg_select_f32: +; GCN: s_waitcnt +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %select = select i1 %cond, float %a, float %b + %fneg = fneg float %select + ret float %fneg +} + +; GCN-LABEL: {{^}}v_fneg_select_2_f32: +; GCN: s_waitcnt +; GCN-NSZ-NEXT: v_sub_f32_e32 [[ADD2:v[0-9]+]], -2.0, v1 +; GCN-NSZ-NEXT: v_sub_f32_e32 [[ADD4:v[0-9]+]], -4.0, v2 +; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc + +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1 +; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2 +; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, -[[ADD4]], -[[ADD2]], vcc + +; GCN-NEXT: s_setpc_b64 +define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) { + %cond = icmp eq i32 %arg0, 0 + %add.0 = fadd float %a, 2.0 + %add.1 = fadd float %b, 4.0 + %select = select i1 %cond, float %add.0, float 
%add.1 + %neg.select = fneg float %select + ret float %neg.select +} + +; GCN-LABEL: {{^}}v_fneg_posk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, -v{{[0-9]+}}, vcc +define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float 4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_negk_select_f32: +; GCN: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -v{{[0-9]+}}, vcc +define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext + %a = load volatile float, ptr addrspace(1) %a.gep + %cond = icmp eq i32 %tid, 0 + %select = select i1 %cond, float -4.0, float %a + %fneg = fneg float %select + store float %fneg, ptr addrspace(1) %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) Index: llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -216,10 +216,10 @@ ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NSZ-NEXT: v_div_fmas_f32 
v0, v0, v1, v3 ; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc @@ -266,9 +266,9 @@ ; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc ; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; VI-NSZ-NEXT: ; return to shader part epilog @@ -302,9 +302,9 @@ ; GCN-NSZ: ; %bb.0: ; %.entry ; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, s0 -; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc ; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 ; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc Index: llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -409,22 +409,23 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GCN-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fneg_xor_select_f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX11-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %select = select i1 %cond, double %arg0, double %arg1 %fneg = fneg double %select Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -7,10 +7,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_legacy_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void 
@select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef @@ -25,10 +24,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]] -; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, ptr addrspace(1) undef %cmp = icmp eq i32 %c, 0 Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll @@ -6,11 +6,10 @@ ; VI-LABEL: select_fneg_posk_src_rcp_f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_rcp_f16_e32 v1, v1 -; VI-NEXT: v_mov_b32_e32 v2, 0xc000 +; VI-NEXT: v_rcp_f16_e64 v1, -v1 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %rcp = call half @llvm.amdgcn.rcp.f16(half %x) Index: llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -804,10 +804,9 @@ ; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32: ; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]] -; GCN: 
v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc -; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] -; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[X]] +; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[RCP]], vcc +; GCN-NEXT: buffer_store_dword [[SELECT]] define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, ptr addrspace(1) undef %y = load volatile float, ptr addrspace(1) undef