diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -56,6 +56,15 @@
          [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
   (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
 
+def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
+
+// Remove a G_FCANONICALIZE whose source is already known to be canonical.
+def remove_fcanonicalize : GICombineRule<
+  (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
+         [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -68,7 +77,7 @@
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper",
   [all_combines, gfx6gfx7_combines,
-   uchar_to_float, cvt_f32_ubyteN]> {
+   uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -66,6 +66,8 @@
   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
   void applyCvtF32UByteN(MachineInstr &MI,
                          const CvtF32UByteMatchInfo &MatchInfo);
+
+  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
 };
 
 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -245,6 +247,17 @@
   MI.eraseFromParent();
 }
 
+/// Match a G_FCANONICALIZE whose source operand is already canonical.
+/// \p Reg is always set to the source register of \p MI; the return value
+/// tells the apply step whether that register may replace the result.
+bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
+    MachineInstr &MI, Register &Reg) {
+  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+      MF.getSubtarget().getTargetLowering());
+  Reg = MI.getOperand(1).getReg();
+  return TLI->isCanonicalized(Reg, MF);
+}
+
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
   CombinerHelper &Helper;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -444,7 +444,10 @@
 
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
+  bool isCanonicalized(Register Reg, MachineFunction &MF,
+                       unsigned MaxDepth = 5) const;
   bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
+  bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
 
   bool isKnownNeverNaNForTargetNode(SDValue Op,
                                     const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9649,6 +9649,55 @@
   llvm_unreachable("invalid operation");
 }
 
+/// Register-based overload of isCanonicalized: returns true when the value
+/// defined by \p Reg can be shown to be canonical already. \p MaxDepth limits
+/// how far the recursion through G_FMINNUM_IEEE/G_FMAXNUM_IEEE sources goes;
+/// a false result only means that nothing could be shown.
+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+                                       unsigned MaxDepth) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  // getVRegDef returns null when Reg has no defining instruction; nothing can
+  // be shown about such a value, so answer conservatively.
+  if (!MI)
+    return false;
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == AMDGPU::G_FCANONICALIZE)
+    return true;
+
+  if (Opcode == AMDGPU::G_FCONSTANT) {
+    // Bind by reference; the APFloat does not need to be copied.
+    const APFloat &F = MI->getOperand(1).getFPImm()->getValueAPF();
+    if (F.isNaN() && F.isSignaling())
+      return false;
+    return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+  }
+
+  if (MaxDepth == 0)
+    return false;
+
+  switch (Opcode) {
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE: {
+    if (Subtarget->supportsMinMaxDenormModes() ||
+        denormalsEnabledForType(MRI.getType(Reg), MF))
+      return true;
+    // Otherwise every source operand has to be canonical itself.
+    for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+        return false;
+    }
+    return true;
+  }
+  default:
+    return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+           isKnownNeverSNaN(Reg, MRI);
+  }
+
+  llvm_unreachable("invalid operation");
+}
+
 // Constant fold canonicalize.
 SDValue SITargetLowering::getCanonicalConstantFP(
     SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
@@ -12014,6 +12063,21 @@
   }
 }
 
+/// Returns whether denormals are enabled in \p MF for the scalar size of
+/// \p Ty (32 bits, or 64/16 bits); any other size yields false.
+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+                                               MachineFunction &MF) const {
+  switch (Ty.getScalarSizeInBits()) {
+  case 32:
+    return hasFP32Denormals(MF);
+  case 64:
+  case 16:
+    return hasFP64FP16Denormals(MF);
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                     const SelectionDAG &DAG,
                                                     bool SNaN,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -0,0 +1,223 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: test_fcanonicalize # canonicalize of a canonicalize: CHECK lines expect a single G_FCANONICALIZE
+tracksRegLiveness: true
+legalized: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: test_fcanonicalize
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: $vgpr0 = COPY [[FCANONICALIZE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_FCANONICALIZE %0
+    %2:_(s32) = G_FCANONICALIZE %1
+    $vgpr0 = COPY %2(s32)
+...
+
+---
+name: test_fconstant # canonicalize of the constant 1.0e10: CHECK lines expect it to be removed
+tracksRegLiveness: true
+legalized: true
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: test_fconstant
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+10
+    ; CHECK: $vgpr0 = COPY [[C]](s32)
+    %0:_(s32) = G_FCONSTANT float 1.0e10
+    %1:_(s32) = G_FCANONICALIZE %0
+    $vgpr0 = COPY %1(s32)
+...
+
+---
+name: test_denormal_fconstant # f64 denormal constant with fp64/fp16 denormals disabled: CHECK lines expect the G_FCANONICALIZE to stay
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    fp64-fp16-output-denormals: false
+    fp64-fp16-input-denormals: false
+body: |
+  bb.0:
+
+    ; CHECK-LABEL: name: test_denormal_fconstant
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.618950e-319
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[C]]
+    ; CHECK: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+    %0:_(s64) = G_FCONSTANT double 0x0000000000008000
+    %1:_(s64) = G_FCANONICALIZE %0
+    $vgpr0_vgpr1 = COPY %1(s64)
+...
+
+---
+name: test_fminnum_with_fminnum_argument_s32_ieee_mode_on # canonicalize of the inner G_FMINNUM_IEEE result: CHECK lines expect it to be removed
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fminnum_with_fminnum_argument_s32_ieee_mode_on
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+    ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+    ; CHECK: [[FMINNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FMINNUM_IEEE]], [[FCANONICALIZE2]]
+    ; CHECK: $vgpr0 = COPY [[FMINNUM_IEEE1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %7:_(s32) = G_FCANONICALIZE %0
+    %8:_(s32) = G_FCANONICALIZE %1
+    %2:_(s32) = G_FMINNUM_IEEE %7, %8
+    %3:_(s32) = COPY $vgpr2
+    %5:_(s32) = G_FCANONICALIZE %2
+    %6:_(s32) = G_FCANONICALIZE %3
+    %4:_(s32) = G_FMINNUM_IEEE %5, %6
+    $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fminnum_with_fmaxnum_argument_s32_ieee_mode_on # canonicalize of a G_FMAXNUM_IEEE result feeding G_FMINNUM_IEEE: CHECK lines expect it to be removed
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fminnum_with_fmaxnum_argument_s32_ieee_mode_on
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+    ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+    ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], [[FCANONICALIZE2]]
+    ; CHECK: $vgpr0 = COPY [[FMINNUM_IEEE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %7:_(s32) = G_FCANONICALIZE %0
+    %8:_(s32) = G_FCANONICALIZE %1
+    %2:_(s32) = G_FMAXNUM_IEEE %7, %8
+    %3:_(s32) = COPY $vgpr2
+    %5:_(s32) = G_FCANONICALIZE %2
+    %6:_(s32) = G_FCANONICALIZE %3
+    %4:_(s32) = G_FMINNUM_IEEE %5, %6
+    $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fmaxnum_with_fmaxnum_argument_s32_ieee_mode_on # canonicalize of the inner G_FMAXNUM_IEEE result: CHECK lines expect it to be removed
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fmaxnum_with_fmaxnum_argument_s32_ieee_mode_on
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+    ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+    ; CHECK: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMAXNUM_IEEE]], [[FCANONICALIZE2]]
+    ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE1]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %7:_(s32) = G_FCANONICALIZE %0
+    %8:_(s32) = G_FCANONICALIZE %1
+    %2:_(s32) = G_FMAXNUM_IEEE %7, %8
+    %3:_(s32) = COPY $vgpr2
+    %5:_(s32) = G_FCANONICALIZE %2
+    %6:_(s32) = G_FCANONICALIZE %3
+    %4:_(s32) = G_FMAXNUM_IEEE %5, %6
+    $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_fmaxnum_with_fminnum_argument_s32_ieee_mode_on # canonicalize of a G_FMINNUM_IEEE result feeding G_FMAXNUM_IEEE: CHECK lines expect it to be removed
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_fmaxnum_with_fminnum_argument_s32_ieee_mode_on
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+    ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK: [[FCANONICALIZE2:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY2]]
+    ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[FCANONICALIZE2]]
+    ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %7:_(s32) = G_FCANONICALIZE %0
+    %8:_(s32) = G_FCANONICALIZE %1
+    %2:_(s32) = G_FMINNUM_IEEE %7, %8
+    %3:_(s32) = COPY $vgpr2
+    %5:_(s32) = G_FCANONICALIZE %2
+    %6:_(s32) = G_FCANONICALIZE %3
+    %4:_(s32) = G_FMAXNUM_IEEE %5, %6
+    $vgpr0 = COPY %4(s32)
+...
+
+---
+name: test_multiple_uses # two canonicalizes of the same G_FMINNUM_IEEE result: CHECK lines expect both to be removed
+tracksRegLiveness: true
+legalized: true
+machineFunctionInfo:
+  mode:
+    ieee: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: test_multiple_uses
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK: [[FCANONICALIZE:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY]]
+    ; CHECK: [[FCANONICALIZE1:%[0-9]+]]:_(s32) = G_FCANONICALIZE [[COPY1]]
+    ; CHECK: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]]
+    ; CHECK: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[FMINNUM_IEEE]]
+    ; CHECK: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %6:_(s32) = G_FCANONICALIZE %0
+    %7:_(s32) = G_FCANONICALIZE %1
+    %2:_(s32) = G_FMINNUM_IEEE %6, %7
+    %4:_(s32) = G_FCANONICALIZE %2
+    %5:_(s32) = G_FCANONICALIZE %2
+    %3:_(s32) = G_FMAXNUM_IEEE %4, %5
+    $vgpr0 = COPY %3(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -132,11 +132,8 @@ ; SI-NEXT: v_min_f32_e32 v5, v2, v3 ; SI-NEXT: v_max_f32_e32 v2, v2, v3 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -174,11 +171,8 @@ ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -198,12 +192,9 @@ ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v2, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -224,10 +215,7 @@ ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX10-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -640,11 +628,8 @@ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -679,13 +664,10 @@ ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_max_f32_e32 v2, v3, v2 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -706,14 +688,11 @@ ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v2, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -733,11 +712,8 @@ ; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 ; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v3, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, 
v2, v2 -; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1]