diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -629,6 +629,15 @@ /// (G_SMULO x, 2) -> (G_SADDO x, x) bool matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Fold redundant fneg operands: (fadd x, fneg(y)) -> (fsub x, y) + /// (fadd fneg(x), y) -> (fsub y, x) + /// (fsub x, fneg(y)) -> (fadd x, y) + /// (fmul fneg(x), fneg(y)) -> (fmul x, y) + /// (fdiv fneg(x), fneg(y)) -> (fdiv x, y) + /// (fmad fneg(x), fneg(y), z) -> (fmad x, y, z) + /// (fma fneg(x), fneg(y), z) -> (fma x, y, z) + bool matchRedundantNegOperands(MachineInstr &MI, BuildFnTy &MatchInfo); + private: /// Given a non-indexed load or store instruction \p MI, find an offset that /// can be usefully and legally folded into it as a post-indexing operation. diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -732,6 +732,12 @@ def mulh_combines : GICombineGroup<[mulh_to_lshr]>; +def redundant_neg_operands: GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMAD, G_FMA):$root, + [{ return Helper.matchRedundantNegOperands(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>; + // FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -779,7 +785,7 @@ shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine, truncstore_merge, div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract, constant_fold, fabs_fneg_fold, - intdiv_combines, mulh_combines]>; + intdiv_combines, mulh_combines, redundant_neg_operands]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4618,6 +4618,51 @@ MI.eraseFromParent(); } +bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI, + BuildFnTy &MatchInfo) { + unsigned Opc = MI.getOpcode(); + assert(Opc == TargetOpcode::G_FADD || Opc == TargetOpcode::G_FSUB || + Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV || + Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA); + + Register Dst = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + Register Y = MI.getOperand(2).getReg(); + LLT Type = MRI.getType(Dst); + + // fold (fadd x, fneg(y)) -> (fsub x, y) + // fold (fadd fneg(y), x) -> (fsub x, y) + // G_FADD is commutative so both cases are checked by m_GFAdd + if (mi_match(Dst, MRI, m_GFAdd(m_Reg(X), m_GFNeg(m_Reg(Y)))) && + isLegalOrBeforeLegalizer({TargetOpcode::G_FSUB, {Type}})) { + Opc = TargetOpcode::G_FSUB; + } + // fold (fsub x, fneg(y)) -> (fadd x, y) + else if (mi_match(Dst, MRI, m_GFSub(m_Reg(X), m_GFNeg(m_Reg(Y)))) && + isLegalOrBeforeLegalizer({TargetOpcode::G_FADD, {Type}})) { + Opc = TargetOpcode::G_FADD; + } + // fold (fmul fneg(x), fneg(y)) -> (fmul x, y) + // fold (fdiv fneg(x), fneg(y)) -> (fdiv x, y) + // fold (fmad fneg(x), fneg(y), z) -> (fmad x, y, z) + // fold (fma fneg(x), 
fneg(y), z) -> (fma x, y, z) + else if ((Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV || + Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA) && + mi_match(X, MRI, m_GFNeg(m_Reg(X))) && + mi_match(Y, MRI, m_GFNeg(m_Reg(Y)))) { + // no opcode change; only the fneg operands are stripped + } else + return false; + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(Opc)); + MI.getOperand(1).setReg(X); + MI.getOperand(2).setReg(Y); + Observer.changedInstr(MI); + }; + return true; +} bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_add_rhs +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_add_rhs + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[FSUB]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FNEG %1 + %3:_(s32) = G_FADD %0, %2 + $vgpr0 = COPY %3(s32) + +... +--- +name: test_add_lhs +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_add_lhs + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY1]], [[COPY]] + ; CHECK-NEXT: $vgpr0 = COPY [[FSUB]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FNEG %0 + %3:_(s32) = G_FADD %2, %1 + $vgpr0 = COPY %3(s32) + +... 
+--- +name: test_sub +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_sub + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[FADD]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FNEG %1 + %3:_(s32) = G_FSUB %0, %2 + $vgpr0 = COPY %3(s32) + +... +--- +name: test_mul +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_mul + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FNEG %0 + %3:_(s32) = G_FNEG %1 + %4:_(s32) = G_FMUL %2, %3 + $vgpr0 = COPY %4(s32) + +... +--- +name: test_div +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_div + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[FDIV]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = G_FNEG %0 + %3:_(s32) = G_FNEG %1 + %4:_(s32) = G_FDIV %2, %3 + $vgpr0 = COPY %4(s32) + +... +--- +name: test_fmad +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_fmad + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[FMAD:%[0-9]+]]:_(s32) = G_FMAD [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMAD]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = G_FNEG %0 + %4:_(s32) = G_FNEG %1 + %5:_(s32) = G_FMAD %3, %4, %2 + $vgpr0 = COPY %5(s32) + +... 
+--- +name: test_fma +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_fma + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[FMA]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = G_FNEG %0 + %4:_(s32) = G_FNEG %1 + %5:_(s32) = G_FMA %3, %4, %2 + $vgpr0 = COPY %5(s32) + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -253,36 +253,21 @@ ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_fma_v2f16_fneg_lhs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 @@ -296,14 +281,14 @@ ; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v2f16_fneg_lhs_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y @@ -414,26 +399,26 @@ ; GFX6-LABEL: v_fma_f64_fneg_all: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX6-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fma_f64_fneg_all: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX8-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fma_f64_fneg_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_f64_fneg_all: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX10-NEXT: v_fma_f64 
v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %neg.y = fneg double %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -92,15 +92,12 @@ ; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 @@ -112,7 +109,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %neg.b = fneg <2 x half> %b @@ -256,18 +253,13 @@ ; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 
s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 @@ -283,8 +275,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <4 x half> %a %neg.b = fneg <4 x half> %b @@ -426,21 +418,14 @@ ; GFX9-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 @@ -461,9 +446,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: 
v_pk_mul_f16 v2, v2, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v4 +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <6 x half> %a %neg.b = fneg <6 x half> %b @@ -625,24 +610,15 @@ ; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 @@ -667,10 +643,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,1] neg_hi:[1,1] -; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX10-NEXT: v_pk_mul_f16 v2, v2, 
v6 +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <8 x half> %a %neg.b = fneg <8 x half> %b