diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,6 +696,9 @@
   /// (fma fneg(x), fneg(y), z) -> (fma x, y, z)
   bool matchRedundantNegOperands(MachineInstr &MI, BuildFnTy &MatchInfo);

+  bool matchFsubToFneg(MachineInstr &MI, Register &MatchInfo);
+  void applyFsubToFneg(MachineInstr &MI, Register &MatchInfo);
+
   bool canCombineFMadOrFMA(MachineInstr &MI, bool &AllowFusionGlobally,
                            bool &HasFMAD, bool &Aggressive,
                            bool CanReassociate = false);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -856,6 +856,13 @@
     [{ return Helper.matchRedundantNegOperands(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;

+// Transform (fsub +-0.0, X) -> (fneg X)
+def fsub_to_fneg: GICombineRule<
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_FSUB):$root,
+    [{ return Helper.matchFsubToFneg(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyFsubToFneg(*${root}, ${matchinfo}); }])>;
+
 // Transform (fadd x, (fmul y, z)) -> (fma y, z, x)
 //           (fadd x, (fmul y, z)) -> (fmad y, z, x)
 // Transform (fadd (fmul x, y), z) -> (fma x, y, z)
@@ -1056,7 +1063,8 @@
     form_bitfield_extract, constant_fold, fabs_fneg_fold,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
-    sub_add_reg, select_to_minmax, redundant_binop_in_equality]>;
+    sub_add_reg, select_to_minmax, redundant_binop_in_equality,
+    fsub_to_fneg]>;

 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5190,6 +5190,38 @@
   return true;
 }

+bool CombinerHelper::matchFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+  Register LHS = MI.getOperand(1).getReg();
+  MatchInfo = MI.getOperand(2).getReg();
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+  const auto LHSCst = Ty.isVector()
+                          ? getFConstantSplat(LHS, MRI, /* allowUndef */ true)
+                          : getFConstantVRegValWithLookThrough(LHS, MRI);
+  if (!LHSCst)
+    return false;
+
+  // -0.0 is always allowed
+  if (LHSCst->Value.isNegZero())
+    return true;
+
+  // +0.0 is only allowed if nsz is set.
+  if (LHSCst->Value.isPosZero())
+    return MI.getFlag(MachineInstr::FmNsz);
+
+  return false;
+}
+
+void CombinerHelper::applyFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+  Builder.setInstrAndDebugLoc(MI);
+  Register Dst = MI.getOperand(0).getReg();
+  Builder.buildFNeg(
+      Dst, Builder.buildFCanonicalize(MRI.getType(Dst), MatchInfo).getReg(0));
+  eraseInst(MI);
+}
+
 /// Checks if \p MI is TargetOpcode::G_FMUL and contractable either
 /// due to global flags or MachineInstr flags.
 static bool isContractableFMul(MachineInstr &MI, bool AllowFusionGlobally) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
@@ -0,0 +1,387 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_f16_poszero_nsz
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f16_poszero_nsz
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: %res:_(s32) = G_ANYEXT [[FCANONICALIZE]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+    %0:_(s32) = COPY $vgpr0
+    %input:_(s16) = G_TRUNC %0
+    %cst:_(s16) = G_FCONSTANT half 0.0
+    %sub:_(s16) = nsz G_FSUB %cst, %input
+    %res:_(s32) = G_ANYEXT %sub
+    $vgpr0 = COPY %res
+...
+
+---
+name: test_f16_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f16_poszero_nonsz_nofold
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: %cst:_(s16) = G_FCONSTANT half 0xH0000
+    ; CHECK-NEXT: %sub:_(s16) = G_FSUB %cst, %input
+    ; CHECK-NEXT: %res:_(s32) = G_ANYEXT %sub(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+    %0:_(s32) = COPY $vgpr0
+    %input:_(s16) = G_TRUNC %0
+    %cst:_(s16) = G_FCONSTANT half 0.0
+    %sub:_(s16) = G_FSUB %cst, %input
+    %res:_(s32) = G_ANYEXT %sub
+    $vgpr0 = COPY %res
+...
+
+---
+name: test_f16_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f16_negzero
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %input:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: %res:_(s32) = G_ANYEXT [[FCANONICALIZE]](s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %res(s32)
+    %0:_(s32) = COPY $vgpr0
+    %input:_(s16) = G_TRUNC %0
+    %cst:_(s16) = G_FCONSTANT half -0.0
+    %sub:_(s16) = G_FSUB %cst, %input
+    %res:_(s32) = G_ANYEXT %sub
+    $vgpr0 = COPY %res
+...
+
+---
+name: test_f32_poszero_nsz
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f32_poszero_nsz
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+    %input:_(s32) = COPY $vgpr0
+    %cst:_(s32) = G_FCONSTANT float 0.0
+    %sub:_(s32) = nsz G_FSUB %cst, %input
+    $vgpr0 = COPY %sub
+...
+
+---
+name: test_f32_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f32_poszero_nonsz_nofold
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 0.000000e+00
+    ; CHECK-NEXT: %sub:_(s32) = G_FSUB %cst, %input
+    ; CHECK-NEXT: $vgpr0 = COPY %sub(s32)
+    %input:_(s32) = COPY $vgpr0
+    %cst:_(s32) = G_FCONSTANT float 0.0
+    %sub:_(s32) = G_FSUB %cst, %input
+    $vgpr0 = COPY %sub
+...
+
+---
+name: test_f32_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_f32_negzero
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+    %input:_(s32) = COPY $vgpr0
+    %cst:_(s32) = G_FCONSTANT float -0.0
+    %sub:_(s32) = G_FSUB %cst, %input
+    $vgpr0 = COPY %sub
+...
+
+---
+name: test_f64_poszero_nsz
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_f64_poszero_nsz
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+    %input:_(s64) = COPY $vgpr0_vgpr1
+    %cst:_(s64) = G_FCONSTANT double 0.0
+    %sub:_(s64) = nsz G_FSUB %cst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_f64_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_f64_poszero_nonsz_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: %cst:_(s64) = G_FCONSTANT double 0.000000e+00
+    ; CHECK-NEXT: %sub:_(s64) = G_FSUB %cst, %input
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %sub(s64)
+    %input:_(s64) = COPY $vgpr0_vgpr1
+    %cst:_(s64) = G_FCONSTANT double 0.0
+    %sub:_(s64) = G_FSUB %cst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_f64_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_f64_negzero
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+    %input:_(s64) = COPY $vgpr0_vgpr1
+    %cst:_(s64) = G_FCONSTANT double -0.0
+    %sub:_(s64) = G_FSUB %cst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_poszero_nsz
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_v4f16_poszero_nsz
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s16>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s16>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](<4 x s16>)
+    %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %cst:_(s16) = G_FCONSTANT half 0.0
+    %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+    %sub:_(<4 x s16>) = nsz G_FSUB %veccst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_poszero_nonsz_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_v4f16_poszero_nonsz_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: %cst:_(s16) = G_FCONSTANT half 0xH0000
+    ; CHECK-NEXT: %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst(s16), %cst(s16), %cst(s16), %cst(s16)
+    ; CHECK-NEXT: %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %sub(<4 x s16>)
+    %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %cst:_(s16) = G_FCONSTANT half 0.0
+    %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+    %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f16_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: test_v4f16_negzero
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s16>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s16>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](<4 x s16>)
+    %input:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %cst:_(s16) = G_FCONSTANT half -0.0
+    %veccst:_(<4 x s16>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+    %sub:_(<4 x s16>) = G_FSUB %veccst, %input
+    $vgpr0_vgpr1 = COPY %sub
+...
+
+---
+name: test_v4f32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v4f32
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+    %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s32) = G_FCONSTANT float 0.0
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+    %sub:_(<4 x s32>) = nsz G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v4f32_negzero
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+    %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s32) = G_FCONSTANT float -0.0
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %cst, %cst, %cst
+    %sub:_(<4 x s32>) = G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_negzero_undef_elt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v4f32_negzero_undef_elt
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+    %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s32) = G_FCONSTANT float -0.0
+    %undef:_(s32) = G_IMPLICIT_DEF
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %undef, %cst, %cst
+    %sub:_(<4 x s32>) = G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v4f32_poszero_undef_elt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v4f32_poszero_undef_elt
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<4 x s32>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<4 x s32>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<4 x s32>)
+    %input:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s32) = G_FCONSTANT float 0.0
+    %undef:_(s32) = G_IMPLICIT_DEF
+    %veccst:_(<4 x s32>) = G_BUILD_VECTOR %cst, %undef, %cst, %cst
+    %sub:_(<4 x s32>) = nsz G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v2f64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v2f64
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s64>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s64>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<2 x s64>)
+    %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s64) = G_FCONSTANT double 0.0
+    %veccst:_(<2 x s64>) = G_BUILD_VECTOR %cst, %cst
+    %sub:_(<2 x s64>) = nsz G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
+---
+name: test_v2f64_negzero
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+
+    ; CHECK-LABEL: name: test_v2f64_negzero
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s64>) = G_FNEG %input
+    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s64>) = G_FCANONICALIZE [[FNEG]]
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FCANONICALIZE]](<2 x s64>)
+    %input:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %cst:_(s64) = G_FCONSTANT double -0.0
+    %veccst:_(<2 x s64>) = G_BUILD_VECTOR %cst, %cst
+    %sub:_(<2 x s64>) = G_FSUB %veccst, %input
+    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %sub
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -23,7 +23,7 @@
 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
 ; SI-NEXT: v_med3_f32 v2, v2, v3, v4
 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -56,7 +56,7 @@
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
 ; VI-NEXT: v_med3_f32 v2, v4, v2, v3
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: s_endpgm
@@ -72,7 +72,7 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT: s_endpgm
@@ -88,7 +88,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
 ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT: s_endpgm
@@ -104,7 +104,7 @@
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -145,7 +145,7 @@
 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT: v_min_f32_e32 v5, v2, v3
 ; SI-NEXT: v_max_f32_e32 v2, v2, v3
@@ -183,7 +183,7 @@
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
 ; VI-NEXT: v_min_f32_e32 v5, v4, v2
 ; VI-NEXT: v_max_f32_e32 v2, v4, v2
@@ -204,7 +204,7 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
@@ -225,7 +225,7 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
 ; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
 ; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
@@ -246,7 +246,8 @@
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_min_f32_e32 v4, v1, v2
 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
@@ -289,9 +290,8 @@
 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_mov_b32 s2, 0x80000000
-; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
-; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
+; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
 ; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -320,13 +320,12 @@
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: flat_load_dword v3, v[4:5] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s2, 0x80000000
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
-; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7
+; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
 ; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: s_endpgm
@@ -342,9 +341,8 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x80000000
-; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT: s_endpgm
@@ -360,8 +358,8 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3
 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT: s_endpgm
@@ -377,8 +375,8 @@
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1
+; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_med3_f32 v1, v1, |v2|, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -425,10 +423,9 @@
 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
 ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_mov_b32 s2, 0x80000000
-; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
-; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
+; SI-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
+; SI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
+; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4|
 ; SI-NEXT: v_med3_f32 v2, v2, v3, v4
 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -457,14 +454,13 @@
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: flat_load_dword v3, v[4:5] glc
 ; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s2, 0x80000000
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
-; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; VI-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
+; VI-NEXT: v_mul_f32_e64 v2, -1.0, |v2|
+; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3|
 ; VI-NEXT: v_med3_f32 v2, v4, v2, v3
 ; VI-NEXT: flat_store_dword v[0:1], v2
 ; VI-NEXT: s_endpgm
@@ -480,10 +476,9 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x80000000
-; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
-; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
-; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
+; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX9-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT: s_endpgm
@@ -499,9 +494,9 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1|
-; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2|
-; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX10-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT: s_endpgm
@@ -517,9 +512,9 @@
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1|
-; GFX11-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2|
-; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
+; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX11-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
+; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -223,9 +223,7 @@
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; GISEL-NEXT: v_sub_f16_e32 v1, 0x8000, v1
-; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v0, -v0, -v1
 ; GISEL-NEXT: ;;#ASMSTART
 ; GISEL-NEXT: ; use v0
 ; GISEL-NEXT: ;;#ASMEND
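
Reviewer note, not part of the patch: a minimal IR-level sketch of the pattern the new combine targets (function names are made up for illustration). Per matchFsubToFneg above, a -0.0 left operand always folds to a negate, while a +0.0 left operand folds only when the instruction carries the nsz flag.

    define float @fold_negzero(float %x) {
      ; always becomes a G_FNEG (plus G_FCANONICALIZE) after the prelegalizer combiner
      %r = fsub float -0.000000e+00, %x
      ret float %r
    }

    define float @fold_poszero_nsz(float %x) {
      ; folds only because of nsz; without the flag the G_FSUB is kept
      %r = fsub nsz float 0.000000e+00, %x
      ret float %r
    }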