diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1170,11 +1170,22 @@
   /// will be set to true.
   bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) const;
 
-  /// Return true when \P Inst is both associative and commutative.
-  virtual bool isAssociativeAndCommutative(const MachineInstr &Inst) const {
+  /// Return true when \P Inst is both associative and commutative. If \P Invert
+  /// is true, then the inverse of \P Inst's operation must be tested.
+  virtual bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                           bool Invert = false) const {
     return false;
   }
 
+  /// Return the inverse operation opcode if it exists for \P Opcode (e.g. add
+  /// for sub and vice versa).
+  virtual std::optional<unsigned> getInverseOpcode(unsigned Opcode) const {
+    return std::nullopt;
+  }
+
+  /// Return true when \P Opcode1 or its inversion is equal to \P Opcode2.
+  bool areOpcodesEqualOrInverse(unsigned Opcode1, unsigned Opcode2) const;
+
   /// Return true when \P Inst has reassociable operands in the same \P MBB.
   virtual bool hasReassociableOperands(const MachineInstr &Inst,
                                        const MachineBasicBlock *MBB) const;
@@ -1207,6 +1218,15 @@
                       SmallVectorImpl<MachineInstr *> &DelInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
 
+  /// Reassociation of some instructions requires inverse operations (e.g.
+  /// (X + A) - Y => (X - Y) + A). This method returns a pair of new opcodes
+  /// (new root opcode, new prev opcode) that must be used to reassociate \P
+  /// Root and \P Prev according to \P Pattern.
+  std::pair<unsigned, unsigned>
+  getReassociationOpcodes(MachineCombinerPattern Pattern,
+                          const MachineInstr &Root,
+                          const MachineInstr &Prev) const;
+
   /// The limit on resource length extension we accept in MachineCombiner Pass.
   virtual int getExtendResourceLenLimit() const { return 0; }
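A note on the new hooks above: `getInverseOpcode` supplies the inverse mapping and `areOpcodesEqualOrInverse` accepts either an exact opcode match or a match against that inverse. The standalone sketch below (plain C++, not part of the patch; the `Op` enum and its `ADD`/`SUB`/`MUL` values are illustrative placeholders, not real target opcodes) shows the intended contract.

```cpp
// Standalone illustration of the new hooks' contract (hypothetical opcodes,
// not LLVM code): the inverse mapping is symmetric, and
// areOpcodesEqualOrInverse() accepts an exact match or the inverse.
#include <cassert>
#include <optional>

enum Op : unsigned { ADD, SUB, MUL };

std::optional<unsigned> getInverseOpcode(unsigned Opcode) {
  switch (Opcode) {
  case ADD:
    return SUB; // sub undoes add
  case SUB:
    return ADD; // and vice versa
  default:
    return std::nullopt; // e.g. MUL: no inverse registered
  }
}

bool areOpcodesEqualOrInverse(unsigned Opcode1, unsigned Opcode2) {
  // Same shape as the TargetInstrInfo helper added in this patch.
  return Opcode1 == Opcode2 || getInverseOpcode(Opcode1) == Opcode2;
}

int main() {
  assert(areOpcodesEqualOrInverse(ADD, ADD));  // exact match
  assert(areOpcodesEqualOrInverse(ADD, SUB));  // the inverse also matches
  assert(!areOpcodesEqualOrInverse(MUL, ADD)); // no inverse registered
  return 0;
}
```

A target opts in by overriding `getInverseOpcode` for the opcodes it supports and by answering the `Invert == true` query in `isAssociativeAndCommutative`; the target overrides later in this patch keep their previous behaviour by declining the inverted query.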
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -715,39 +715,50 @@
   return MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB;
 }
 
+bool TargetInstrInfo::areOpcodesEqualOrInverse(unsigned Opcode1,
+                                               unsigned Opcode2) const {
+  return Opcode1 == Opcode2 || getInverseOpcode(Opcode1) == Opcode2;
+}
+
 bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
                                              bool &Commuted) const {
   const MachineBasicBlock *MBB = Inst.getParent();
   const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
   MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
-  unsigned AssocOpcode = Inst.getOpcode();
+  unsigned Opcode = Inst.getOpcode();
 
-  // If only one operand has the same opcode and it's the second source operand,
-  // the operands must be commuted.
-  Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+  // If only one operand has the same or inverse opcode and it's the second
+  // source operand, the operands must be commuted.
+  Commuted = !areOpcodesEqualOrInverse(Opcode, MI1->getOpcode()) &&
+             areOpcodesEqualOrInverse(Opcode, MI2->getOpcode());
   if (Commuted)
     std::swap(MI1, MI2);
 
   // 1. The previous instruction must be the same type as Inst.
-  // 2. The previous instruction must also be associative/commutative (this can
-  //    be different even for instructions with the same opcode if traits like
-  //    fast-math-flags are included).
+  // 2. The previous instruction must also be associative/commutative or be the
+  //    inverse of such an operation (this can be different even for
+  //    instructions with the same opcode if traits like fast-math-flags are
+  //    included).
   // 3. The previous instruction must have virtual register definitions for its
   //    operands in the same basic block as Inst.
   // 4. The previous instruction's result must only be used by Inst.
-  return MI1->getOpcode() == AssocOpcode && isAssociativeAndCommutative(*MI1) &&
+  return areOpcodesEqualOrInverse(Opcode, MI1->getOpcode()) &&
+         (isAssociativeAndCommutative(*MI1) ||
+          isAssociativeAndCommutative(*MI1, /* Invert */ true)) &&
          hasReassociableOperands(*MI1, MBB) &&
          MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
 }
 
-// 1. The operation must be associative and commutative.
+// 1. The operation must be associative and commutative or be the inverse of
+//    such an operation.
// 2. The instruction must have virtual register definitions for its
//    operands in the same basic block.
// 3. The instruction must have a reassociable sibling.
 bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
                                                bool &Commuted) const {
-  return isAssociativeAndCommutative(Inst) &&
+  return (isAssociativeAndCommutative(Inst) ||
+          isAssociativeAndCommutative(Inst, /* Invert */ true)) &&
          hasReassociableOperands(Inst, Inst.getParent()) &&
          hasReassociableSibling(Inst, Commuted);
 }
@@ -801,6 +812,111 @@
   return false;
 }
 
+std::pair<unsigned, unsigned>
+TargetInstrInfo::getReassociationOpcodes(MachineCombinerPattern Pattern,
+                                         const MachineInstr &Root,
+                                         const MachineInstr &Prev) const {
+  bool AssocCommutRoot = isAssociativeAndCommutative(Root);
+  bool AssocCommutPrev = isAssociativeAndCommutative(Prev);
+
+  // Early exit if both opcodes are associative and commutative. It's a trivial
+  // reassociation when we only change the order of operands. In this case the
+  // opcodes are not required to have inverse versions.
+  if (AssocCommutRoot && AssocCommutPrev) {
+    assert(Root.getOpcode() == Prev.getOpcode() && "Expected to be equal");
+    return std::make_pair(Root.getOpcode(), Root.getOpcode());
+  }
+
+  // At least one instruction is not associative or commutative.
+  // Since we have matched one of the reassociation patterns, we expect that the
+  // instructions' opcodes are equal or one of them is the inversion of the
+  // other.
+  assert(areOpcodesEqualOrInverse(Root.getOpcode(), Prev.getOpcode()) &&
+         "Incorrectly matched pattern");
+  unsigned AssocCommutOpcode = Root.getOpcode();
+  unsigned InverseOpcode = getInverseOpcode(Root.getOpcode()).value();
+  if (!AssocCommutRoot)
+    std::swap(AssocCommutOpcode, InverseOpcode);
+
+  // The transformation rule (`+` is any associative and commutative binary
+  // operation, `-` is the inverse):
+  // REASSOC_AX_BY:
+  //   (A + X) + Y => A + (X + Y)
+  //   (A + X) - Y => A + (X - Y)
+  //   (A - X) + Y => A - (X - Y)
+  //   (A - X) - Y => A - (X + Y)
+  // REASSOC_XA_BY:
+  //   (X + A) + Y => (X + Y) + A
+  //   (X + A) - Y => (X - Y) + A
+  //   (X - A) + Y => (X + Y) - A
+  //   (X - A) - Y => (X - Y) - A
+  // REASSOC_AX_YB:
+  //   Y + (A + X) => (Y + X) + A
+  //   Y - (A + X) => (Y - X) - A
+  //   Y + (A - X) => (Y - X) + A
+  //   Y - (A - X) => (Y + X) - A
+  // REASSOC_XA_YB:
+  //   Y + (X + A) => (Y + X) + A
+  //   Y - (X + A) => (Y - X) - A
+  //   Y + (X - A) => (Y + X) - A
+  //   Y - (X - A) => (Y - X) + A
+  switch (Pattern) {
+  default:
+    llvm_unreachable("Unexpected pattern");
+  case MachineCombinerPattern::REASSOC_AX_BY:
+    if (!AssocCommutRoot && AssocCommutPrev)
+      return {AssocCommutOpcode, InverseOpcode};
+    if (AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, InverseOpcode};
+    if (!AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, AssocCommutOpcode};
+    break;
+  case MachineCombinerPattern::REASSOC_XA_BY:
+    if (!AssocCommutRoot && AssocCommutPrev)
+      return {AssocCommutOpcode, InverseOpcode};
+    if (AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, AssocCommutOpcode};
+    if (!AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, InverseOpcode};
+    break;
+  case MachineCombinerPattern::REASSOC_AX_YB:
+    if (!AssocCommutRoot && AssocCommutPrev)
+      return {InverseOpcode, InverseOpcode};
+    if (AssocCommutRoot && !AssocCommutPrev)
+      return {AssocCommutOpcode, InverseOpcode};
+    if (!AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, AssocCommutOpcode};
+    break;
+  case MachineCombinerPattern::REASSOC_XA_YB:
+    if (!AssocCommutRoot && AssocCommutPrev)
+      return {InverseOpcode, InverseOpcode};
+    if (AssocCommutRoot && !AssocCommutPrev)
+      return {InverseOpcode, AssocCommutOpcode};
+    if (!AssocCommutRoot && !AssocCommutPrev)
+      return {AssocCommutOpcode, InverseOpcode};
+    break;
+  }
+  llvm_unreachable("Unhandled combination");
+}
+
+// Return a pair of boolean flags showing whether the new root and new prev
+// operands must be swapped. See the visual example of the rule in
+// TargetInstrInfo::getReassociationOpcodes.
+static std::pair<bool, bool> mustSwapOperands(MachineCombinerPattern Pattern) {
+  switch (Pattern) {
+  default:
+    llvm_unreachable("Unexpected pattern");
+  case MachineCombinerPattern::REASSOC_AX_BY:
+    return {false, false};
+  case MachineCombinerPattern::REASSOC_XA_BY:
+    return {true, false};
+  case MachineCombinerPattern::REASSOC_AX_YB:
+    return {true, true};
+  case MachineCombinerPattern::REASSOC_XA_YB:
+    return {true, true};
+  }
+}
+
 /// Attempt the reassociation transformation to reduce critical path length.
 /// See the above comments before getMachineCombinerPatterns().
 void TargetInstrInfo::reassociateOps(
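As a spot check of the REASSOC_XA_BY row above with a commutative Prev (`+`) and an inverted Root (`-`): `(X + A) - Y` becomes `(X - Y) + A`, so `getReassociationOpcodes` returns the commutative opcode for the new Root and the inverse opcode for the new Prev, with `A` moved to the Root. The snippet below (plain C++ with integers, not part of the patch) only verifies the arithmetic identity behind that row.

```cpp
// Numeric spot-check of one row of the transformation table:
//   REASSOC_XA_BY:  (X + A) - Y  =>  (X - Y) + A
// getReassociationOpcodes() would return {add, sub} here, i.e. the new Root is
// the associative/commutative opcode and the new Prev is its inverse.
#include <cassert>

int main() {
  int X = 7, A = 3, Y = 5;
  int Before = (X + A) - Y; // original chain: Prev = add, Root = sub
  int After = (X - Y) + A;  // rewritten chain: NewPrev = sub, NewRoot = add
  assert(Before == After);  // the reassociation preserves the value
  return 0;
}
```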
@@ -863,21 +979,35 @@
   Register NewVR = MRI.createVirtualRegister(RC);
   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
 
-  unsigned Opcode = Root.getOpcode();
+  auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev);
   bool KillA = OpA.isKill();
   bool KillX = OpX.isKill();
   bool KillY = OpY.isKill();
+  bool KillNewVR = true;
+
+  auto [SwapRootOperands, SwapPrevOperands] = mustSwapOperands(Pattern);
+
+  if (SwapPrevOperands) {
+    std::swap(RegX, RegY);
+    std::swap(KillX, KillY);
+  }
 
   // Create new instructions for insertion.
   MachineInstrBuilder MIB1 =
-      BuildMI(*MF, MIMetadata(Prev), TII->get(Opcode), NewVR)
+      BuildMI(*MF, MIMetadata(Prev), TII->get(NewPrevOpc), NewVR)
           .addReg(RegX, getKillRegState(KillX))
           .addReg(RegY, getKillRegState(KillY))
           .setMIFlags(Prev.getFlags());
+
+  if (SwapRootOperands) {
+    std::swap(RegA, NewVR);
+    std::swap(KillA, KillNewVR);
+  }
+
   MachineInstrBuilder MIB2 =
-      BuildMI(*MF, MIMetadata(Root), TII->get(Opcode), RegC)
+      BuildMI(*MF, MIMetadata(Root), TII->get(NewRootOpc), RegC)
           .addReg(RegA, getKillRegState(KillA))
-          .addReg(NewVR, getKillRegState(true))
+          .addReg(NewVR, getKillRegState(KillNewVR))
           .setMIFlags(Root.getFlags());
 
   setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -260,8 +260,10 @@
                                   SmallVectorImpl<MachineCombinerPattern> &Patterns,
                                   bool DoRegPressureReduce) const override;
   /// Return true when Inst is associative and commutative so that it can be
-  /// reassociated.
-  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+  /// reassociated. If Invert is true, then the inverse of Inst's operation
+  /// must be checked.
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
   /// When getMachineCombinerPatterns() finds patterns, this function generates
   /// the instructions that could replace the original code sequence
   void genAlternativeCodeSequence(
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4939,8 +4939,10 @@
 // 1. Other data types (integer, vectors)
 // 2. Other math / logic operations (xor, or)
 // 3. Other forms of the same operation (intrinsics and other variants)
-bool AArch64InstrInfo::isAssociativeAndCommutative(
-    const MachineInstr &Inst) const {
+bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  if (Invert)
+    return false;
   switch (Inst.getOpcode()) {
   case AArch64::FADDDrr:
   case AArch64::FADDSrr:
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -498,7 +498,8 @@
   finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
                     SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
 
-  bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
 
   /// On PowerPC, we try to reassociate FMA chain which will increase
   /// instruction size. Set extension resource length limit to 1 for edge case.
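The target overrides in this patch (AArch64 above, and PowerPC, RISC-V, and X86 below) all return false when `Invert` is set and none of them overrides `getInverseOpcode`, so no in-tree target exercises the inversion support yet. The observable codegen change comes from the operand-order adjustment introduced by `mustSwapOperands`, which is why the test updates further below are dominated by operand swaps on commutative operations. A standalone check (plain C++ integers standing in for the FP/vector ops, not part of the patch) that the old and new emission orders seen in `reassociate_adds3` agree:

```cpp
// The test churn below is dominated by operand-order swaps on commutative
// operations (e.g. reassociate_adds3: "fadd s1, s2, s3" becomes
// "fadd s1, s3, s2" and the final add takes its operands in the other order).
// With integers in place of the FP ops, all three evaluation orders agree.
#include <cassert>

int main() {
  int x0 = 1, x1 = 2, x2 = 3, x3 = 4;
  int Serial = ((x0 + x1) + x2) + x3;      // the IR chain before combining
  int OldEmission = (x0 + x1) + (x2 + x3); // pre-patch reassociated order
  int NewEmission = (x3 + x2) + (x0 + x1); // post-patch order, operands swapped
  assert(Serial == OldEmission && OldEmission == NewEmission);
  return 0;
}
```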
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -251,7 +251,10 @@ // reduce the critical path. Mostly, this means floating-point operations, // because they have high latencies(>=5) (compared to other operations, such as // and/or, which are also associative and commutative, but have low latencies). -bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { +bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, + bool Invert) const { + if (Invert) + return false; switch (Inst.getOpcode()) { // Floating point: // FP Add: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -205,7 +205,8 @@ bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) const override; - bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + bool isAssociativeAndCommutative(const MachineInstr &Inst, + bool Invert) const override; protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1200,9 +1200,12 @@ return RISCV::hasEqualFRM(Inst, Sibling); } -bool RISCVInstrInfo::isAssociativeAndCommutative( - const MachineInstr &Inst) const { +bool RISCVInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, + bool Invert) const { unsigned Opc = Inst.getOpcode(); + if (Invert) + return false; + if (isFADD(Opc) || isFMUL(Opc)) return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -506,7 +506,8 @@ bool useMachineCombiner() const override { return true; } - bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + bool isAssociativeAndCommutative(const MachineInstr &Inst, + bool Invert) const override; bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -8716,7 +8716,10 @@ // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) // 3. 
Other forms of the same operation (intrinsics and other variants) -bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, + bool Invert) const { + if (Invert) + return false; switch (Inst.getOpcode()) { case X86::ADD8rr: case X86::ADD16rr: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -711,7 +711,7 @@ ; CHECK-NOLSE-O1-NEXT: ldurb w11, [x0, #-256] ; CHECK-NOLSE-O1-NEXT: ldrb w8, [x8] ; CHECK-NOLSE-O1-NEXT: add w9, w9, w11 -; CHECK-NOLSE-O1-NEXT: add w9, w10, w9 +; CHECK-NOLSE-O1-NEXT: add w9, w9, w10 ; CHECK-NOLSE-O1-NEXT: add w0, w9, w8 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -735,7 +735,7 @@ ; CHECK-LSE-O1-NEXT: ldrb w9, [x0, w1, sxtw] ; CHECK-LSE-O1-NEXT: ldurb w10, [x0, #-256] ; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w9, w8 +; CHECK-LSE-O1-NEXT: add w8, w8, w9 ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrb w9, [x9] ; CHECK-LSE-O1-NEXT: add w0, w8, w9 @@ -781,7 +781,7 @@ ; CHECK-NOLSE-O1-NEXT: ldurh w11, [x0, #-256] ; CHECK-NOLSE-O1-NEXT: ldrh w8, [x8] ; CHECK-NOLSE-O1-NEXT: add w9, w9, w11 -; CHECK-NOLSE-O1-NEXT: add w9, w10, w9 +; CHECK-NOLSE-O1-NEXT: add w9, w9, w10 ; CHECK-NOLSE-O1-NEXT: add w0, w9, w8 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -805,7 +805,7 @@ ; CHECK-LSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1] ; CHECK-LSE-O1-NEXT: ldurh w10, [x0, #-256] ; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w9, w8 +; CHECK-LSE-O1-NEXT: add w8, w8, w9 ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldrh w9, [x9] ; CHECK-LSE-O1-NEXT: add w0, w8, w9 @@ -851,7 +851,7 @@ ; CHECK-NOLSE-O1-NEXT: ldur w11, [x0, #-256] ; CHECK-NOLSE-O1-NEXT: ldr w8, [x8] ; CHECK-NOLSE-O1-NEXT: add w9, w9, w11 -; CHECK-NOLSE-O1-NEXT: add w9, w10, w9 +; CHECK-NOLSE-O1-NEXT: add w9, w9, w10 ; CHECK-NOLSE-O1-NEXT: add w0, w9, w8 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -873,7 +873,7 @@ ; CHECK-LSE-O1-NEXT: ldr w9, [x0, w1, sxtw #2] ; CHECK-LSE-O1-NEXT: ldur w10, [x0, #-256] ; CHECK-LSE-O1-NEXT: add w8, w8, w10 -; CHECK-LSE-O1-NEXT: add w8, w9, w8 +; CHECK-LSE-O1-NEXT: add w8, w8, w9 ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldr w9, [x9] ; CHECK-LSE-O1-NEXT: add w0, w8, w9 @@ -917,7 +917,7 @@ ; CHECK-NOLSE-O1-NEXT: ldur x11, [x0, #-256] ; CHECK-NOLSE-O1-NEXT: ldr x8, [x8] ; CHECK-NOLSE-O1-NEXT: add x9, x9, x11 -; CHECK-NOLSE-O1-NEXT: add x9, x10, x9 +; CHECK-NOLSE-O1-NEXT: add x9, x9, x10 ; CHECK-NOLSE-O1-NEXT: add x0, x9, x8 ; CHECK-NOLSE-O1-NEXT: ret ; @@ -939,7 +939,7 @@ ; CHECK-LSE-O1-NEXT: ldr x9, [x0, w1, sxtw #3] ; CHECK-LSE-O1-NEXT: ldur x10, [x0, #-256] ; CHECK-LSE-O1-NEXT: add x8, x8, x10 -; CHECK-LSE-O1-NEXT: add x8, x9, x8 +; CHECK-LSE-O1-NEXT: add x8, x8, x9 ; CHECK-LSE-O1-NEXT: add x9, x0, #291, lsl #12 ; =1191936 ; CHECK-LSE-O1-NEXT: ldr x9, [x9] ; CHECK-LSE-O1-NEXT: add x0, x8, x9 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -389,8 +389,8 @@ ; CHECK-NEXT: renamable $w10 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_regoff) ; 
CHECK-NEXT: renamable $w11 = LDURBBi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unscaled) ; CHECK-NEXT: renamable $w8 = LDRBBui killed renamable $x8, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0 - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w10, killed renamable $w9, 0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0 ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i8, ptr %p, i32 4095 @@ -421,8 +421,8 @@ ; CHECK-NEXT: renamable $w10 = LDRHHroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s16) from %ir.ptr_regoff) ; CHECK-NEXT: renamable $w11 = LDURHHi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unscaled) ; CHECK-NEXT: renamable $w8 = LDRHHui killed renamable $x8, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0 - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w10, killed renamable $w9, 0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0 ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i16, ptr %p, i32 4095 @@ -453,8 +453,8 @@ ; CHECK-NEXT: renamable $w10 = LDRWroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s32) from %ir.ptr_regoff) ; CHECK-NEXT: renamable $w11 = LDURWi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s32) from %ir.ptr_unscaled) ; CHECK-NEXT: renamable $w8 = LDRWui killed renamable $x8, 0, pcsections !0 :: (load unordered (s32) from %ir.ptr_random) - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0 - ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w10, killed renamable $w9, 0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0 + ; CHECK-NEXT: $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0 ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $w0 %ptr_unsigned = getelementptr i32, ptr %p, i32 4095 @@ -485,8 +485,8 @@ ; CHECK-NEXT: renamable $x10 = LDRXroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s64) from %ir.ptr_regoff) ; CHECK-NEXT: renamable $x11 = LDURXi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s64) from %ir.ptr_unscaled) ; CHECK-NEXT: renamable $x8 = LDRXui killed renamable $x8, 0, pcsections !0 :: (load unordered (s64) from %ir.ptr_random) - ; CHECK-NEXT: $x9 = ADDXrs killed renamable $x9, killed renamable $x11, 0 - ; CHECK-NEXT: $x9 = ADDXrs killed renamable $x10, killed renamable $x9, 0 + ; CHECK-NEXT: $x9 = ADDXrs killed renamable $x9, killed renamable $x11, 0, pcsections !0 + ; CHECK-NEXT: $x9 = ADDXrs killed renamable $x9, killed renamable $x10, 0, pcsections !0 ; CHECK-NEXT: $x0 = ADDXrs killed renamable $x9, killed renamable $x8, 0, pcsections !0 ; CHECK-NEXT: RET undef $lr, implicit $x0 %ptr_unsigned = getelementptr i64, ptr %p, i32 4095 diff 
--git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -850,14 +850,14 @@ ; GISEL-NEXT: and x12, x8, #0xff00000000 ; GISEL-NEXT: and x13, x9, #0xff0000000000 ; GISEL-NEXT: orr x10, x11, x10 -; GISEL-NEXT: orr x11, x12, x13 +; GISEL-NEXT: orr x11, x13, x12 ; GISEL-NEXT: and x12, x8, #0xff0000 ; GISEL-NEXT: and x13, x9, #0xff000000 -; GISEL-NEXT: orr x12, x12, x13 +; GISEL-NEXT: orr x12, x13, x12 ; GISEL-NEXT: and x8, x8, #0xff -; GISEL-NEXT: orr x10, x10, x11 -; GISEL-NEXT: orr x8, x12, x8 -; GISEL-NEXT: orr x8, x10, x8 +; GISEL-NEXT: orr x10, x11, x10 +; GISEL-NEXT: orr x8, x8, x12 +; GISEL-NEXT: orr x8, x8, x10 ; GISEL-NEXT: and x9, x9, #0xff00 ; GISEL-NEXT: orr x0, x9, x8 ; GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll --- a/llvm/test/CodeGen/AArch64/machine-combiner.ll +++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll @@ -75,7 +75,7 @@ ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 ; CHECK-UNSAFE-NEXT: fadd s1, s2, s3 -; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 +; CHECK-UNSAFE-NEXT: fadd s0, s1, s0 ; CHECK-UNSAFE-NEXT: ret %t0 = fadd float %x0, %x1 %t1 = fadd float %x2, %t0 @@ -94,8 +94,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_adds3: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 -; CHECK-UNSAFE-NEXT: fadd s1, s2, s3 -; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 +; CHECK-UNSAFE-NEXT: fadd s1, s3, s2 +; CHECK-UNSAFE-NEXT: fadd s0, s1, s0 ; CHECK-UNSAFE-NEXT: ret %t0 = fadd float %x0, %x1 %t1 = fadd float %t0, %x2 @@ -114,8 +114,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_adds4: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 -; CHECK-UNSAFE-NEXT: fadd s1, s2, s3 -; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 +; CHECK-UNSAFE-NEXT: fadd s1, s3, s2 +; CHECK-UNSAFE-NEXT: fadd s0, s1, s0 ; CHECK-UNSAFE-NEXT: ret %t0 = fadd float %x0, %x1 %t1 = fadd float %x2, %t0 @@ -174,8 +174,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_adds6: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fdiv s0, s0, s1 -; CHECK-UNSAFE-NEXT: fadd s1, s2, s3 -; CHECK-UNSAFE-NEXT: fadd s0, s0, s1 +; CHECK-UNSAFE-NEXT: fadd s1, s3, s2 +; CHECK-UNSAFE-NEXT: fadd s0, s1, s0 ; CHECK-UNSAFE-NEXT: ret %t0 = fdiv float %x0, %x1 %t1 = fadd float %x2, %t0 @@ -196,8 +196,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_muls1: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fdiv s0, s0, s1 -; CHECK-UNSAFE-NEXT: fmul s1, s2, s3 -; CHECK-UNSAFE-NEXT: fmul s0, s0, s1 +; CHECK-UNSAFE-NEXT: fmul s1, s3, s2 +; CHECK-UNSAFE-NEXT: fmul s0, s1, s0 ; CHECK-UNSAFE-NEXT: ret %t0 = fdiv float %x0, %x1 %t1 = fmul float %x2, %t0 @@ -218,8 +218,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_adds_double: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fdiv d0, d0, d1 -; CHECK-UNSAFE-NEXT: fadd d1, d2, d3 -; CHECK-UNSAFE-NEXT: fadd d0, d0, d1 +; CHECK-UNSAFE-NEXT: fadd d1, d3, d2 +; CHECK-UNSAFE-NEXT: fadd d0, d1, d0 ; CHECK-UNSAFE-NEXT: ret %t0 = fdiv double %x0, %x1 %t1 = fadd double %x2, %t0 @@ -240,8 +240,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_muls_double: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fdiv d0, d0, d1 -; CHECK-UNSAFE-NEXT: fmul d1, d2, d3 -; CHECK-UNSAFE-NEXT: fmul d0, d0, d1 +; CHECK-UNSAFE-NEXT: fmul d1, d3, d2 +; CHECK-UNSAFE-NEXT: fmul d0, d1, d0 ; CHECK-UNSAFE-NEXT: ret %t0 = fdiv double %x0, %x1 %t1 = fmul double %x2, %t0 @@ -283,7 +283,7 @@ ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECK-UNSAFE-NEXT: 
fadd v1.4s, v2.4s, v3.4s -; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECK-UNSAFE-NEXT: ret %t0 = fadd <4 x float> %x0, %x1 %t1 = fadd <4 x float> %x2, %t0 @@ -302,8 +302,8 @@ ; CHECK-UNSAFE-LABEL: vector_reassociate_adds3: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-UNSAFE-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-UNSAFE-NEXT: fadd v1.4s, v3.4s, v2.4s +; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECK-UNSAFE-NEXT: ret %t0 = fadd <4 x float> %x0, %x1 %t1 = fadd <4 x float> %t0, %x2 @@ -322,8 +322,8 @@ ; CHECK-UNSAFE-LABEL: vector_reassociate_adds4: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-UNSAFE-NEXT: fadd v1.4s, v2.4s, v3.4s -; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-UNSAFE-NEXT: fadd v1.4s, v3.4s, v2.4s +; CHECK-UNSAFE-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECK-UNSAFE-NEXT: ret %t0 = fadd <4 x float> %x0, %x1 %t1 = fadd <4 x float> %x2, %t0 @@ -343,8 +343,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_muls_v4f32: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-UNSAFE-NEXT: fmul v1.4s, v2.4s, v3.4s -; CHECK-UNSAFE-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-UNSAFE-NEXT: fmul v1.4s, v3.4s, v2.4s +; CHECK-UNSAFE-NEXT: fmul v0.4s, v1.4s, v0.4s ; CHECK-UNSAFE-NEXT: ret %t0 = fadd <4 x float> %x0, %x1 %t1 = fmul <4 x float> %x2, %t0 @@ -365,8 +365,8 @@ ; CHECK-UNSAFE-LABEL: reassociate_muls_v2f64: ; CHECK-UNSAFE: // %bb.0: ; CHECK-UNSAFE-NEXT: fadd v0.2d, v0.2d, v1.2d -; CHECK-UNSAFE-NEXT: fmul v1.2d, v2.2d, v3.2d -; CHECK-UNSAFE-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-UNSAFE-NEXT: fmul v1.2d, v3.2d, v2.2d +; CHECK-UNSAFE-NEXT: fmul v0.2d, v1.2d, v0.2d ; CHECK-UNSAFE-NEXT: ret %t0 = fadd <2 x double> %x0, %x1 %t1 = fmul <2 x double> %x2, %t0 diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -24,9 +24,9 @@ ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 -; CHECK-NEXT: fmul s1, s1, s0 +; CHECK-NEXT: fmul s1, s0, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 -; CHECK-NEXT: fmul s1, s2, s1 +; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fcsel s0, s0, s1, eq ; CHECK-NEXT: ret %1 = tail call fast float @llvm.sqrt.f32(float %a) @@ -47,9 +47,9 @@ ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 -; CHECK-NEXT: fmul s1, s1, s0 +; CHECK-NEXT: fmul s1, s0, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 -; CHECK-NEXT: fmul s1, s2, s1 +; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fcsel s0, s0, s1, eq ; CHECK-NEXT: ret %1 = tail call fast float @llvm.sqrt.f32(float %a) @@ -69,9 +69,9 @@ ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s -; CHECK-NEXT: fmul v1.2s, v1.2s, v0.2s +; CHECK-NEXT: fmul v1.2s, v0.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v1.2s, v2.2s, v1.2s +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s ; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret @@ -92,9 +92,9 @@ ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v0.4s +; CHECK-NEXT: fmul v1.4s, v0.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s -; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s +; 
CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s ; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret @@ -119,16 +119,16 @@ ; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s ; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s -; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v0.4s, v2.4s ; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s ; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v1.4s, v3.4s ; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s -; CHECK-NEXT: fmul v2.4s, v4.4s, v2.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s ; CHECK-NEXT: fcmeq v4.4s, v0.4s, #0.0 ; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b -; CHECK-NEXT: fmul v3.4s, v5.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s ; CHECK-NEXT: fcmeq v5.4s, v1.4s, #0.0 ; CHECK-NEXT: bif v1.16b, v3.16b, v5.16b ; CHECK-NEXT: ret @@ -153,9 +153,9 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d2, d1, d1 -; CHECK-NEXT: fmul d1, d1, d0 +; CHECK-NEXT: fmul d1, d0, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 -; CHECK-NEXT: fmul d1, d2, d1 +; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fcsel d0, d0, d1, eq ; CHECK-NEXT: ret %1 = tail call fast double @llvm.sqrt.f64(double %a) @@ -179,9 +179,9 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d2, d1, d1 -; CHECK-NEXT: fmul d1, d1, d0 +; CHECK-NEXT: fmul d1, d0, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 -; CHECK-NEXT: fmul d1, d2, d1 +; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fcsel d0, d0, d1, eq ; CHECK-NEXT: ret %1 = tail call fast double @llvm.sqrt.f64(double %a) @@ -204,9 +204,9 @@ ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: fmul v1.2d, v1.2d, v0.2d +; CHECK-NEXT: fmul v1.2d, v0.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v1.2d, v2.2d, v1.2d +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d ; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret @@ -237,16 +237,16 @@ ; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d ; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d ; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v0.2d, v2.2d ; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d ; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v1.2d, v3.2d ; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d -; CHECK-NEXT: fmul v2.2d, v4.2d, v2.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d ; CHECK-NEXT: fcmeq v4.2d, v0.2d, #0.0 ; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b -; CHECK-NEXT: fmul v3.2d, v5.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d ; CHECK-NEXT: fcmeq v5.2d, v1.2d, #0.0 ; CHECK-NEXT: bif v1.16b, v3.16b, v5.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -25,7 +25,7 @@ ; CHECK: # %bb.0: ; CHECK: fadds [[REG0:[0-9]+]], 1, 2 ; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] +; CHECK: fadds 1, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 @@ -38,8 +38,8 @@ ; CHECK-LABEL: reassociate_adds3: ; CHECK: # %bb.0: ; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, 
[[REG0]], [[REG1]] +; CHECK: fadds [[REG1:[0-9]+]], 4, 3 +; CHECK: fadds 1, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 @@ -52,8 +52,8 @@ ; CHECK-LABEL: reassociate_adds4: ; CHECK: # %bb.0: ; CHECK: fadds [[REG0:[0-9]+]], 1, 2 -; CHECK: fadds [[REG1:[0-9]+]], 3, 4 -; CHECK: fadds 1, [[REG0]], [[REG1]] +; CHECK: fadds [[REG1:[0-9]+]], 4, 3 +; CHECK: fadds 1, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz float %x0, %x1 @@ -108,7 +108,7 @@ ; CHECK: # %bb.0: ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 ; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 @@ -121,8 +121,8 @@ ; CHECK-LABEL: vector_reassociate_adds3: ; CHECK: # %bb.0: ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 37, 36 +; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 @@ -135,8 +135,8 @@ ; CHECK-LABEL: vector_reassociate_adds4: ; CHECK: # %bb.0: ; CHECK-PWR: xvaddsp [[REG0:[0-9]+]], 34, 35 -; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 36, 37 -; CHECK-PWR: xvaddsp 34, [[REG0]], [[REG1]] +; CHECK-PWR: xvaddsp [[REG1:[0-9]+]], 37, 36 +; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %t0 = fadd reassoc nsz <4 x float> %x0, %x1 diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll --- a/llvm/test/CodeGen/RISCV/machine-combiner.ll +++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll @@ -21,7 +21,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: fadd.d ft0, fa0, fa1 ; CHECK-NEXT: fadd.d ft1, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, ft0, ft1 +; CHECK-NEXT: fadd.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -33,8 +33,8 @@ ; CHECK-LABEL: test_reassoc_fadd3: ; CHECK: # %bb.0: ; CHECK-NEXT: fadd.d ft0, fa0, fa1 -; CHECK-NEXT: fadd.d ft1, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, ft0, ft1 +; CHECK-NEXT: fadd.d ft1, fa3, fa2 +; CHECK-NEXT: fadd.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %t0, %a2 @@ -46,8 +46,8 @@ ; CHECK-LABEL: test_reassoc_fadd4: ; CHECK: # %bb.0: ; CHECK-NEXT: fadd.d ft0, fa0, fa1 -; CHECK-NEXT: fadd.d ft1, fa2, fa3 -; CHECK-NEXT: fadd.d fa0, ft0, ft1 +; CHECK-NEXT: fadd.d ft1, fa3, fa2 +; CHECK-NEXT: fadd.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fadd nsz reassoc double %a0, %a1 %t1 = fadd nsz reassoc double %a2, %t0 @@ -73,7 +73,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: fmul.d ft0, fa0, fa1 ; CHECK-NEXT: fmul.d ft1, fa2, fa3 -; CHECK-NEXT: fmul.d fa0, ft0, ft1 +; CHECK-NEXT: fmul.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %a2, %t0 @@ -85,8 +85,8 @@ ; CHECK-LABEL: test_reassoc_fmul3: ; CHECK: # %bb.0: ; CHECK-NEXT: fmul.d ft0, fa0, fa1 -; CHECK-NEXT: fmul.d ft1, fa2, fa3 -; CHECK-NEXT: fmul.d fa0, ft0, ft1 +; CHECK-NEXT: fmul.d ft1, fa3, fa2 +; CHECK-NEXT: fmul.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fmul nsz reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %t0, %a2 @@ -98,8 +98,8 @@ ; CHECK-LABEL: test_reassoc_fmul4: ; CHECK: # %bb.0: ; CHECK-NEXT: fmul.d ft0, fa0, fa1 -; CHECK-NEXT: fmul.d ft1, fa2, fa3 -; CHECK-NEXT: fmul.d fa0, ft0, ft1 +; CHECK-NEXT: fmul.d ft1, fa3, fa2 +; CHECK-NEXT: fmul.d fa0, ft1, ft0 ; CHECK-NEXT: ret %t0 = fmul nsz 
reassoc double %a0, %a1 %t1 = fmul nsz reassoc double %a2, %t0 @@ -135,7 +135,7 @@ ; CHECK-NEXT: fmul.d ft1, ft1, fa1 ; CHECK-NEXT: fadd.d ft2, fa0, fa1 ; CHECK-NEXT: fadd.d ft3, fa2, fa1 -; CHECK-NEXT: fmul.d ft0, ft0, ft1 +; CHECK-NEXT: fmul.d ft0, ft1, ft0 ; CHECK-NEXT: fadd.d ft1, fa2, ft2 ; CHECK-NEXT: fmul.d ft2, fa0, ft3 ; CHECK-NEXT: fsub.d ft1, fa3, ft1 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1243,8 +1243,8 @@ ; X86-NEXT: kmovd %eax, %k2 ; X86-NEXT: kshiftlq $63, %k2, %k2 ; X86-NEXT: kshiftrq $58, %k2, %k2 -; X86-NEXT: korq %k2, %k1, %k1 -; X86-NEXT: korq %k1, %k0, %k0 +; X86-NEXT: korq %k1, %k2, %k1 +; X86-NEXT: korq %k0, %k1, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1364,8 +1364,8 @@ ; X86-NEXT: kmovd %eax, %k2 ; X86-NEXT: kshiftlq $63, %k2, %k2 ; X86-NEXT: kshiftrq $58, %k2, %k2 -; X86-NEXT: korq %k2, %k1, %k1 -; X86-NEXT: korq %k1, %k0, %k0 +; X86-NEXT: korq %k1, %k2, %k1 +; X86-NEXT: korq %k0, %k1, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -53,8 +53,8 @@ ; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0 ; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2 -; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmulph %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) %2 = fdiv fast <32 x half> %a1, %1 diff --git a/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll --- a/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll @@ -26,7 +26,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz half %x0, %x1 %t1 = fadd reassoc nsz half %x2, %t0 @@ -38,8 +38,8 @@ ; CHECK-LABEL: reassociate_adds3: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz half %x0, %x1 %t1 = fadd reassoc nsz half %t0, %x2 @@ -51,8 +51,8 @@ ; CHECK-LABEL: reassociate_adds4: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz half %x0, %x1 %t1 = fadd reassoc nsz half %x2, %t0 @@ -93,8 +93,8 @@ ; CHECK-LABEL: reassociate_adds6: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fdiv reassoc nsz half %x0, %x1 %t1 = fadd reassoc 
nsz half %x2, %t0 @@ -108,8 +108,8 @@ ; CHECK-LABEL: reassociate_muls1: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmulsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vmulsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fdiv reassoc nsz half %x0, %x1 %t1 = fmul reassoc nsz half %x2, %t0 @@ -123,8 +123,8 @@ ; CHECK-LABEL: reassociate_adds_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddph %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vaddph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fdiv reassoc nsz <8 x half> %x0, %x1 %t1 = fadd reassoc nsz <8 x half> %x2, %t0 @@ -138,8 +138,8 @@ ; CHECK-LABEL: reassociate_muls_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmulph %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulph %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vmulph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz <8 x half> %x0, %x1 %t1 = fmul reassoc nsz <8 x half> %x2, %t0 @@ -153,8 +153,8 @@ ; CHECK-LABEL: reassociate_adds_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vaddph %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm2, %ymm3, %ymm1 +; CHECK-NEXT: vaddph %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %t0 = fdiv reassoc nsz <16 x half> %x0, %x1 %t1 = fadd reassoc nsz <16 x half> %x2, %t0 @@ -168,8 +168,8 @@ ; CHECK-LABEL: reassociate_muls_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmulph %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmulph %ymm2, %ymm3, %ymm1 +; CHECK-NEXT: vmulph %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz <16 x half> %x0, %x1 %t1 = fmul reassoc nsz <16 x half> %x2, %t0 @@ -183,8 +183,8 @@ ; CHECK-LABEL: reassociate_adds_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vaddph %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vaddph %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %t0 = fdiv reassoc nsz <32 x half> %x0, %x1 %t1 = fadd reassoc nsz <32 x half> %x2, %t0 @@ -198,8 +198,8 @@ ; CHECK-LABEL: reassociate_muls_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmulph %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %t0 = fadd reassoc nsz <32 x half> %x0, %x1 %t1 = fmul reassoc nsz <32 x half> %x2, %t0 @@ -213,8 +213,8 @@ ; CHECK-LABEL: reassociate_mins_half: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vminsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fdiv half %x0, %x1 %cmp1 = fcmp olt half %x2, %t0 @@ -230,8 +230,8 @@ ; CHECK-LABEL: reassociate_maxs_half: ; CHECK: # %bb.0: ; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmaxsh %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxsh %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fdiv half %x0, %x1 %cmp1 = fcmp ogt half %x2, %t0 @@ -247,8 +247,8 @@ ; CHECK-LABEL: reassociate_mins_v8f16: ; CHECK: # %bb.0: ; 
CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vminph %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminph %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd <8 x half> %x0, %x1 %cmp1 = fcmp olt <8 x half> %x2, %t0 @@ -264,8 +264,8 @@ ; CHECK-LABEL: reassociate_maxs_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmaxph %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxph %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %t0 = fadd <8 x half> %x0, %x1 %cmp1 = fcmp ogt <8 x half> %x2, %t0 @@ -281,8 +281,8 @@ ; CHECK-LABEL: reassociate_mins_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vminph %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vminph %ymm2, %ymm3, %ymm1 +; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %t0 = fadd <16 x half> %x0, %x1 %cmp1 = fcmp olt <16 x half> %x2, %t0 @@ -298,8 +298,8 @@ ; CHECK-LABEL: reassociate_maxs_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmaxph %ymm3, %ymm2, %ymm1 -; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmaxph %ymm2, %ymm3, %ymm1 +; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %t0 = fadd <16 x half> %x0, %x1 %cmp1 = fcmp ogt <16 x half> %x2, %t0 @@ -315,8 +315,8 @@ ; CHECK-LABEL: reassociate_mins_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vminph %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vminph %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %t0 = fadd <32 x half> %x0, %x1 %cmp1 = fcmp olt <32 x half> %x2, %t0 @@ -332,8 +332,8 @@ ; CHECK-LABEL: reassociate_maxs_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vmaxph %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmaxph %zmm2, %zmm3, %zmm1 +; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %t0 = fadd <32 x half> %x0, %x1 %cmp1 = fcmp ogt <32 x half> %x2, %t0 diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: vpbroadcastq %rdi, %zmm3 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2 -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm4 -; CHECK-NEXT: vpaddq %zmm4, %zmm2, %zmm2 +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm4 +; CHECK-NEXT: vpaddq %zmm2, %zmm4, %zmm2 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm3 -; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) ; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4358,8 +4358,8 @@ ; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] ; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] ; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X86-NEXT: vpaddw %xmm1, 
%xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128: @@ -4368,8 +4368,8 @@ ; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] ; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] ; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask) @@ -4389,8 +4389,8 @@ ; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] ; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] ; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X86-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4400,8 +4400,8 @@ ; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] ; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] ; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -703,14 +703,14 @@ ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] ; AVX1-SLOW-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq @@ -727,14 +727,14 @@ ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-FAST-NEXT: retq ; @@ -759,8 +759,8 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -782,8 +782,8 @@ ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -16,8 +16,8 @@ ; AVX2-LABEL: reassociate_and_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v4i32: @@ -43,8 +43,8 @@ ; AVX2-LABEL: reassociate_or_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v4i32: @@ -70,8 +70,8 @@ ; AVX2-LABEL: reassociate_xor_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm1 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v4i32: @@ -102,8 +102,8 @@ ; AVX2-LABEL: reassociate_and_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; 
AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v8i32: @@ -132,8 +132,8 @@ ; AVX2-LABEL: reassociate_or_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v8i32: @@ -162,8 +162,8 @@ ; AVX2-LABEL: reassociate_xor_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v8i32: @@ -201,11 +201,11 @@ ; AVX2-LABEL: reassociate_and_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_and_v16i32: @@ -240,11 +240,11 @@ ; AVX2-LABEL: reassociate_or_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_or_v16i32: @@ -279,11 +279,11 @@ ; AVX2-LABEL: reassociate_xor_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_xor_v16i32: @@ -311,8 +311,8 @@ ; AVX-LABEL: reassociate_umax_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxub %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxub %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxub %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -336,8 +336,8 @@ ; AVX-LABEL: reassociate_umax_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxuw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxuw %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxuw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -374,8 +374,8 @@ ; AVX-LABEL: reassociate_umax_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxud %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxud %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxud %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -439,8 +439,8 @@ ; AVX512-LABEL: reassociate_umax_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, 
%xmm0, %xmm0 -; AVX512-NEXT: vpmaxuq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxuq %xmm2, %xmm3, %xmm1 +; AVX512-NEXT: vpmaxuq %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -470,8 +470,8 @@ ; AVX-LABEL: reassociate_smax_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -493,8 +493,8 @@ ; AVX-LABEL: reassociate_smax_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsw %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -524,8 +524,8 @@ ; AVX-LABEL: reassociate_smax_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmaxsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmaxsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -584,8 +584,8 @@ ; AVX512-LABEL: reassociate_smax_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmaxsq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsq %xmm2, %xmm3, %xmm1 +; AVX512-NEXT: vpmaxsq %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -607,8 +607,8 @@ ; AVX-LABEL: reassociate_umin_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminub %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminub %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminub %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -635,8 +635,8 @@ ; AVX-LABEL: reassociate_umin_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminuw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminuw %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminuw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -672,8 +672,8 @@ ; AVX-LABEL: reassociate_umin_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminud %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminud %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminud %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -737,8 +737,8 @@ ; AVX512-LABEL: reassociate_umin_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpminuq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminuq %xmm2, %xmm3, %xmm1 +; AVX512-NEXT: vpminuq %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -768,8 +768,8 @@ ; AVX-LABEL: reassociate_smin_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsb %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminsb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <16 x i8> %x0, %x1 @@ -791,8 +791,8 @@ ; AVX-LABEL: reassociate_smin_v8i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsw %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsw %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminsw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <8 x i16> %x0, %x1 @@ -822,8 +822,8 @@ ; AVX-LABEL: 
reassociate_smin_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = add <4 x i32> %x0, %x1 @@ -882,8 +882,8 @@ ; AVX512-LABEL: reassociate_smin_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpminsq %xmm3, %xmm2, %xmm1 -; AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsq %xmm2, %xmm3, %xmm1 +; AVX512-NEXT: vpminsq %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %t0 = add <2 x i64> %x0, %x1 @@ -910,8 +910,8 @@ ; AVX-LABEL: reassociate_umax_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxub %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxub %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxub %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -940,8 +940,8 @@ ; AVX-LABEL: reassociate_umax_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxuw %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxuw %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -995,8 +995,8 @@ ; AVX-LABEL: reassociate_umax_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxud %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxud %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxud %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1091,8 +1091,8 @@ ; AVX512-LABEL: reassociate_umax_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmaxuq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmaxuq %ymm2, %ymm3, %ymm1 +; AVX512-NEXT: vpmaxuq %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1133,8 +1133,8 @@ ; AVX-LABEL: reassociate_smax_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsb %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxsb %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1159,8 +1159,8 @@ ; AVX-LABEL: reassociate_smax_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxsw %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1201,8 +1201,8 @@ ; AVX-LABEL: reassociate_smax_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmaxsd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpmaxsd %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1292,8 +1292,8 @@ ; AVX512-LABEL: reassociate_smax_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmaxsq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmaxsq %ymm2, %ymm3, %ymm1 +; AVX512-NEXT: vpmaxsq %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1318,8 +1318,8 @@ ; AVX-LABEL: reassociate_umin_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminub %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminub %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminub %ymm0, %ymm1, %ymm0 ; 
AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1354,8 +1354,8 @@ ; AVX-LABEL: reassociate_umin_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminuw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminuw %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminuw %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1408,8 +1408,8 @@ ; AVX-LABEL: reassociate_umin_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminud %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminud %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminud %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1504,8 +1504,8 @@ ; AVX512-LABEL: reassociate_umin_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpminuq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuq %ymm2, %ymm3, %ymm1 +; AVX512-NEXT: vpminuq %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1546,8 +1546,8 @@ ; AVX-LABEL: reassociate_smin_v32i8: ; AVX: # %bb.0: ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsb %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsb %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminsb %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <32 x i8> %x0, %x1 @@ -1572,8 +1572,8 @@ ; AVX-LABEL: reassociate_smin_v16i16: ; AVX: # %bb.0: ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsw %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsw %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminsw %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <16 x i16> %x0, %x1 @@ -1614,8 +1614,8 @@ ; AVX-LABEL: reassociate_smin_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpminsd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpminsd %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vpminsd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = add <8 x i32> %x0, %x1 @@ -1705,8 +1705,8 @@ ; AVX512-LABEL: reassociate_smin_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpminsq %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminsq %ymm2, %ymm3, %ymm1 +; AVX512-NEXT: vpminsq %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %t0 = add <4 x i64> %x0, %x1 @@ -1740,17 +1740,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxub %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxub %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxub %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxub %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmaxub %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxub %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxub %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxub %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -1798,17 +1798,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxuw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxuw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxuw %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxuw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: 
vpmaxuw %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxuw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxuw %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxuw %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -1907,17 +1907,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxud %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxud %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxud %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmaxud %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umax_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxud %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxud %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxud %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -2091,8 +2091,8 @@ ; AVX512-LABEL: reassociate_umax_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxuq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxuq %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2164,17 +2164,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsb %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsb %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsb %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmaxsb %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsb %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsb %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxsb %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2206,17 +2206,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsw %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmaxsw %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsw %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxsw %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -2288,17 +2288,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmaxsd %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmaxsd %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpmaxsd %ymm0, %ymm2, %ymm0 +; 
AVX2-NEXT: vpmaxsd %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smax_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsd %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxsd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -2463,8 +2463,8 @@ ; AVX512-LABEL: reassociate_smax_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsq %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2496,17 +2496,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminub %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminub %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminub %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminub %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminub %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminub %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminub %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminub %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminub %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2566,17 +2566,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminuw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminuw %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminuw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminuw %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminuw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminuw %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminuw %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -2672,17 +2672,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminud %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminud %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_umin_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminud %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminud %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminud %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -2856,8 +2856,8 @@ ; AVX512-LABEL: reassociate_umin_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminuq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminuq %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminuq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 @@ -2929,17 
+2929,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsb %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsb %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminsb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsb %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminsb %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminsb %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsb %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsb %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminsb %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <64 x i8> %x0, %x1 @@ -2971,17 +2971,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsw %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsw %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsw %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminsw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminsw %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsw %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsw %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminsw %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <32 x i16> %x0, %x1 @@ -3053,17 +3053,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpminsd %ymm7, %ymm5, %ymm2 -; AVX2-NEXT: vpminsd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminsd %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpminsd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpminsd %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: reassociate_smin_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsd %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminsd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -3228,8 +3228,8 @@ ; AVX512-LABEL: reassociate_smin_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpminsq %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vpminsq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = add <8 x i64> %x0, %x1 diff --git a/llvm/test/CodeGen/X86/machine-combiner.ll b/llvm/test/CodeGen/X86/machine-combiner.ll --- a/llvm/test/CodeGen/X86/machine-combiner.ll +++ b/llvm/test/CodeGen/X86/machine-combiner.ll @@ -44,7 +44,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -63,8 +63,8 @@ ; AVX-LABEL: reassociate_adds3: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = 
fadd reassoc nsz float %t0, %x2 @@ -83,8 +83,8 @@ ; AVX-LABEL: reassociate_adds4: ; AVX: # %bb.0: ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -143,8 +143,8 @@ ; AVX-LABEL: reassociate_adds6: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv reassoc nsz float %x0, %x1 %t1 = fadd reassoc nsz float %x2, %t0 @@ -165,8 +165,8 @@ ; AVX-LABEL: reassociate_muls1: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv reassoc nsz float %x0, %x1 %t1 = fmul reassoc nsz float %x2, %t0 @@ -187,8 +187,8 @@ ; AVX-LABEL: reassociate_adds_double: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv reassoc nsz double %x0, %x1 %t1 = fadd reassoc nsz double %x2, %t0 @@ -209,8 +209,8 @@ ; AVX-LABEL: reassociate_muls_double: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv reassoc nsz double %x0, %x1 %t1 = fmul reassoc nsz double %x2, %t0 @@ -231,8 +231,8 @@ ; AVX1-LABEL: reassociate_adds_v4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vaddps %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm1 +; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v4f32: @@ -259,8 +259,8 @@ ; AVX1-LABEL: reassociate_adds_v2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vaddpd %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vaddpd %xmm2, %xmm3, %xmm1 +; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v2f64: @@ -287,8 +287,8 @@ ; AVX-LABEL: reassociate_muls_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <4 x float> %x0, %x1 %t1 = fmul reassoc nsz <4 x float> %x2, %t0 @@ -309,8 +309,8 @@ ; AVX-LABEL: reassociate_muls_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulpd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmulpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <2 x double> %x0, %x1 %t1 = fmul reassoc nsz <2 x double> %x2, %t0 @@ -334,8 +334,8 @@ ; AVX1-LABEL: reassociate_adds_v8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vaddps %ymm3, %ymm2, %ymm1 -; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm2, %ymm3, %ymm1 +; AVX1-NEXT: vaddps %ymm0, %ymm1, 
%ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v8f32: @@ -365,8 +365,8 @@ ; AVX1-LABEL: reassociate_adds_v4f64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm3, %ymm2, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm3, %ymm1 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v4f64: @@ -396,8 +396,8 @@ ; AVX-LABEL: reassociate_muls_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmulps %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <8 x float> %x0, %x1 %t1 = fmul reassoc nsz <8 x float> %x2, %t0 @@ -421,8 +421,8 @@ ; AVX-LABEL: reassociate_muls_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmulpd %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd reassoc nsz <4 x double> %x0, %x1 %t1 = fmul reassoc nsz <4 x double> %x2, %t0 @@ -453,10 +453,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vaddps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vaddps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vaddps %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v16f32: @@ -493,10 +493,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vaddpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vaddpd %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_adds_v8f64: @@ -533,17 +533,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmulps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmulps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vmulps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmulps %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_muls_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmulps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd reassoc nsz <16 x float> %x0, %x1 %t1 = fmul reassoc nsz <16 x float> %x2, %t0 @@ -574,17 +574,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmulpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmulpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmulpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_muls_v8f64: ; AVX512: # %bb.0: ; 
AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmulpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd reassoc nsz <8 x double> %x0, %x1 %t1 = fmul reassoc nsz <8 x double> %x2, %t0 @@ -605,8 +605,8 @@ ; AVX-LABEL: reassociate_mins_single: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vminss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv float %x0, %x1 %cmp1 = fcmp olt float %x2, %t0 @@ -629,8 +629,8 @@ ; AVX-LABEL: reassociate_maxs_single: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxss %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv float %x0, %x1 %cmp1 = fcmp ogt float %x2, %t0 @@ -653,8 +653,8 @@ ; AVX-LABEL: reassociate_mins_double: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv double %x0, %x1 %cmp1 = fcmp olt double %x2, %t0 @@ -677,8 +677,8 @@ ; AVX-LABEL: reassociate_maxs_double: ; AVX: # %bb.0: ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxsd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxsd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fdiv double %x0, %x1 %cmp1 = fcmp ogt double %x2, %t0 @@ -701,8 +701,8 @@ ; AVX-LABEL: reassociate_mins_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vminps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd <4 x float> %x0, %x1 %cmp1 = fcmp olt <4 x float> %x2, %t0 @@ -725,8 +725,8 @@ ; AVX-LABEL: reassociate_maxs_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxps %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd <4 x float> %x0, %x1 %cmp1 = fcmp ogt <4 x float> %x2, %t0 @@ -749,8 +749,8 @@ ; AVX-LABEL: reassociate_mins_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vminpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminpd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd <2 x double> %x0, %x1 %cmp1 = fcmp olt <2 x double> %x2, %t0 @@ -773,8 +773,8 @@ ; AVX-LABEL: reassociate_maxs_v2f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmaxpd %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxpd %xmm2, %xmm3, %xmm1 +; AVX-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %t0 = fadd <2 x double> %x0, %x1 %cmp1 = fcmp ogt <2 x double> %x2, %t0 @@ -800,8 +800,8 @@ ; AVX-LABEL: reassociate_mins_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vminps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vminps %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vminps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd <8 x float> %x0, %x1 %cmp1 = fcmp olt <8 x float> %x2, %t0 @@ -827,8 +827,8 @@ ; AVX-LABEL: 
reassociate_maxs_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmaxps %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmaxps %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vmaxps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd <8 x float> %x0, %x1 %cmp1 = fcmp ogt <8 x float> %x2, %t0 @@ -854,8 +854,8 @@ ; AVX-LABEL: reassociate_mins_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vminpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vminpd %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd <4 x double> %x0, %x1 %cmp1 = fcmp olt <4 x double> %x2, %t0 @@ -881,8 +881,8 @@ ; AVX-LABEL: reassociate_maxs_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmaxpd %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmaxpd %ymm2, %ymm3, %ymm1 +; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retq %t0 = fadd <4 x double> %x0, %x1 %cmp1 = fcmp ogt <4 x double> %x2, %t0 @@ -915,17 +915,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vminps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vminps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vminps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vminps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vminps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vminps %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vminps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_mins_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vminps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vminps %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vminps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <16 x float> %x0, %x1 %cmp1 = fcmp olt <16 x float> %x2, %t0 @@ -958,17 +958,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmaxps %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmaxps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmaxps %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmaxps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaxps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vmaxps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmaxps %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vmaxps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_maxs_v16f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmaxps %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmaxps %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmaxps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <16 x float> %x0, %x1 %cmp1 = fcmp ogt <16 x float> %x2, %t0 @@ -1001,17 +1001,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vminpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vminpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vminpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vminpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vminpd %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vminpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vminpd %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vminpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_mins_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vminpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vminpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vminpd %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <8 x double> %x0, %x1 %cmp1 = fcmp olt <8 x double> %x2, %t0 @@ 
-1044,17 +1044,17 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmaxpd %ymm6, %ymm4, %ymm2 -; AVX1-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmaxpd %ymm7, %ymm5, %ymm2 -; AVX1-NEXT: vmaxpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmaxpd %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vmaxpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmaxpd %ymm5, %ymm7, %ymm2 +; AVX1-NEXT: vmaxpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX512-LABEL: reassociate_maxs_v8f64: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmaxpd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmaxpd %zmm2, %zmm3, %zmm1 +; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %t0 = fadd <8 x double> %x0, %x1 %cmp1 = fcmp ogt <8 x double> %x2, %t0 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2690,8 +2690,8 @@ ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%r10), %xmm2 ; AVX-NEXT: vpmaddwd (%rax), %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -502,11 +502,11 @@ ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 -; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} @@ -586,11 +586,11 @@ ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 -; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 +; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll --- a/llvm/test/CodeGen/X86/mul-constant-i64.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll @@ -1000,8 +1000,8 @@ ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: 
.cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 ; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm2 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4 ; CHECK-NEXT: vmulss %xmm6, %xmm12, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0 @@ -24,33 +24,33 @@ ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm4 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm14 * xmm4) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm9 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm8 ; CHECK-NEXT: vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: vmovaps %xmm5, %xmm10 ; CHECK-NEXT: vmulss %xmm14, %xmm8, %xmm5 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm5, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm13 * xmm5) + xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm4 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm11 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm11 * xmm3) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm15 * xmm3) + xmm0 @@ -62,8 +62,8 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm6 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm6, %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 +; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 @@ -75,8 +75,8 @@ ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm6, %xmm4, %xmm4 -; CHECK-NEXT: vmulss %xmm4, 
%xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm4, %xmm6, %xmm4 +; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm1 @@ -88,7 +88,7 @@ ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm10 = -(xmm10 * mem) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm12 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm12 = -(xmm7 * xmm12) + xmm0 -; CHECK-NEXT: vmulss %xmm12, %xmm10, %xmm10 +; CHECK-NEXT: vmulss %xmm10, %xmm12, %xmm10 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vmulss %xmm4, %xmm10, %xmm12 @@ -100,17 +100,17 @@ ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm6 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vmulss %xmm6, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm4 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm3 @@ -120,24 +120,24 @@ ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 4-byte Reload ; CHECK-NEXT: # xmm10 = mem[0],zero,zero,zero ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0 -; CHECK-NEXT: vmulss %xmm7, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm7, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm8, %xmm2 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 @@ -147,13 +147,13 @@ ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm6 * xmm3) + xmm0 
-; CHECK-NEXT: vmulss %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm3 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm6 * xmm11) + xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm2 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0 @@ -161,7 +161,7 @@ ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm4 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm11, %xmm2 -; CHECK-NEXT: vmulss %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vmulss %xmm2, %xmm4, %xmm2 ; CHECK-NEXT: vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload ; CHECK-NEXT: # xmm14 = -(xmm14 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm2, %xmm14, %xmm4 @@ -188,18 +188,18 @@ ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 ; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 -; CHECK-NEXT: vmulss %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vmulss %xmm5, %xmm0, %xmm0 +; CHECK-NEXT: vmulss %xmm4, %xmm3, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm9, %xmm0 -; CHECK-NEXT: vmulss %xmm7, %xmm0, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm7, %xmm0 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vmulss %xmm6, %xmm2, %xmm1 +; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulss %xmm2, %xmm6, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm11, %xmm1 ; CHECK-NEXT: vmulss %xmm1, %xmm10, %xmm1 ; CHECK-NEXT: vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ; CHECK-NEXT: vmulss %xmm1, %xmm8, %xmm1 -; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovss %xmm0, (%rdi) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -598,8 +598,8 @@ ; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: div_sqrt_fabs_f32: @@ -610,8 +610,8 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %s = call fast float @llvm.sqrt.f32(float %z) %a = call fast float @llvm.fabs.f32(float %y) @@ -778,8 +778,8 @@ ; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmulss 
%xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: div_sqrt_f32: @@ -790,8 +790,8 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %s = call fast float @llvm.sqrt.f32(float %y) %m = fmul fast float %s, %y diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -143,11 +143,11 @@ ; X86-NEXT: setne %al ; X86-NEXT: andb %bl, %al ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb %bh, %al -; X86-NEXT: andb $1, %al -; X86-NEXT: movb %al, 16(%ecx) +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; X86-NEXT: orb %al, %bh +; X86-NEXT: andb $1, %bh +; X86-NEXT: movb %bh, 16(%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $24, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -841,10 +841,10 @@ ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -866,10 +866,10 @@ ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 -; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] -; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2OR512-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2OR512-NEXT: retq %wide.vec = load <96 x i8>, ptr %ptr %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> @@ -896,10 +896,10 @@ ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1 -; 
AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %wide.vec = load <48 x i8>, ptr %ptr %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> @@ -924,8 +924,8 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %wide.vec = load <24 x i8>, ptr %ptr %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> @@ -1366,44 +1366,44 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 ; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 ; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm8 ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 ; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm8 ; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: retq @@ -1456,11 +1456,11 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = 
[255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_load_vf64_i8_stride3: @@ -1491,10 +1491,10 @@ ; AVX512-NEXT: kmovq %rax, %k1 ; AVX512-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} ; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] -; AVX512-NEXT: vpaddb %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddb %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] ; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] -; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %wide.vec = load <192 x i8>, ptr %ptr, align 1 %v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32>