Index: llvm/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/MachineCombiner.cpp
+++ llvm/lib/CodeGen/MachineCombiner.cpp
@@ -92,9 +92,11 @@
   bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
   bool combineInstructions(MachineBasicBlock *);
   MachineInstr *getOperandDef(const MachineOperand &MO);
-  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
-                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                    MachineTraceMetrics::Trace BlockTrace);
+  bool isCoalescableCopy(MachineInstr *MI);
+  std::pair<unsigned, unsigned>
+  getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+           MachineTraceMetrics::Trace BlockTrace);
   unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                       MachineTraceMetrics::Trace BlockTrace);
   bool
@@ -158,6 +160,43 @@
   return DefInstr;
 }
 
+/// Check if MI is a COPY instruction, and if its src and dst registers can be
+/// coalesced.
+bool MachineCombiner::isCoalescableCopy(MachineInstr *MI) {
+  if (!MI->isCopy())
+    return false;
+
+  Register Dst = MI->getOperand(0).getReg();
+  Register Src = MI->getOperand(1).getReg();
+
+  if (!MI->isFullCopy()) {
+    // If src RC contains super registers of dst RC, it can also be coalesced.
+    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
+      return false;
+
+    auto SrcSub = MI->getOperand(1).getSubReg();
+    auto SrcRC = MRI->getRegClass(Src);
+    auto DstRC = MRI->getRegClass(Dst);
+    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
+  }
+
+  if (Src.isPhysical() && Dst.isPhysical())
+    return Src == Dst;
+
+  if (Src.isVirtual() && Dst.isVirtual()) {
+    auto SrcRC = MRI->getRegClass(Src);
+    auto DstRC = MRI->getRegClass(Dst);
+    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
+  }
+
+  if (Src.isVirtual())
+    std::swap(Src, Dst);
+
+  // Now Src is a physical register and Dst is a virtual register.
+  auto DstRC = MRI->getRegClass(Dst);
+  return DstRC->contains(Src);
+}
+
 /// Computes depth of instructions in vector \InsInstr.
 ///
 /// \param InsInstrs is a vector of machine instructions
@@ -165,8 +204,8 @@
 /// of defining machine instruction in \p InsInstrs
 /// \param BlockTrace is a trace of machine instructions
 ///
-/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
-unsigned
+/// \returns Depth of the first and last instruction in \p InsInstrs ("NewRoot")
+std::pair<unsigned, unsigned>
 MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineTraceMetrics::Trace BlockTrace) {
@@ -204,9 +243,10 @@
       MachineInstr *DefInstr = getOperandDef(MO);
       if (DefInstr) {
         DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
-        LatencyOp = TSchedModel.computeOperandLatency(
-            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
-            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+        if (!isCoalescableCopy(DefInstr))
+          LatencyOp = TSchedModel.computeOperandLatency(
+              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
       }
     }
     IDepth = std::max(IDepth, DepthOp + LatencyOp);
@@ -214,7 +254,7 @@
     InstrDepth.push_back(IDepth);
   }
   unsigned NewRootIdx = InsInstrs.size() - 1;
-  return InstrDepth[NewRootIdx];
+  return {InstrDepth[0], InstrDepth[NewRootIdx]};
 }
 
 /// Computes instruction latency as max of latency of defined operands.
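The zero-latency treatment of coalescable copies in the hunk above matches what register allocation is expected to do later, namely fold such a COPY away entirely. The effect on the depth calculation can be pictured with a small standalone model (plain C++, not LLVM APIs; the structure and all names here are invented for illustration):

  // depth_model.cpp: depth of an instruction is the max over its operands of
  // the producer's depth plus the producer->consumer latency; a producer that
  // is a coalescable COPY contributes zero latency, as in getDepth() above.
  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct Instr {
    std::vector<int> Ops;   // indices of producing instructions
    int Latency;            // latency of the produced value
    bool IsCoalescableCopy; // stands in for isCoalescableCopy(DefInstr)
  };

  int main() {
    // 0: mul (latency 3), 1: COPY of the mul result, 2: add using the COPY.
    std::vector<Instr> Seq = {{{}, 3, false}, {{0}, 1, true}, {{1}, 1, false}};
    std::vector<int> Depth;
    for (const Instr &I : Seq) {
      int D = 0;
      for (int Op : I.Ops) {
        int LatencyOp = Seq[Op].IsCoalescableCopy ? 0 : Seq[Op].Latency;
        D = std::max(D, Depth[Op] + LatencyOp);
      }
      Depth.push_back(D);
    }
    // Mirrors the {InstrDepth[0], InstrDepth[NewRootIdx]} pair returned above.
    std::printf("first=%d last=%d\n", Depth.front(), Depth.back()); // 0 and 3
    return 0;
  }

With the copy treated as free, the final add sits at depth 3 instead of 4, so a rewrite that routes a value through such a copy is not penalized.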
@@ -337,8 +377,11 @@
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // Get depth and latency of NewRoot and Root.
-  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
+  unsigned NewFirstDepth, NewRootDepth;
+  std::tie(NewFirstDepth, NewRootDepth) =
+      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
+  unsigned FirstDepth = BlockTrace.getInstrCycles(*DelInstrs[0]).Depth;
 
   LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                     << NewRootDepth << "\tRootDepth: " << RootDepth);
@@ -366,9 +409,9 @@
       getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned NewCycleCount = NewFirstDepth + NewRootLatency;
   unsigned OldCycleCount =
-      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+      FirstDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
   LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                     << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                     << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
@@ -381,7 +424,10 @@
 
   LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                     << ", OldCycleCount = " << OldCycleCount << "\n");
-  return NewCycleCount <= OldCycleCount;
+  if (NewCycleCount == OldCycleCount)
+    return InsInstrs.size() < DelInstrs.size();
+  else
+    return NewCycleCount < OldCycleCount;
 }
 
 /// helper routine to convert instructions into SC
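Taken together, the cost-model hunks change two things. First, both sequences are now measured from the depth of their first instruction (FirstDepth / NewFirstDepth) rather than from Root alone, which keeps the two cycle counts comparable when the rewrite reshapes the dependence chain. Second, an equal-cycle rewrite is only accepted when it shrinks the code. A condensed restatement as a free function (illustrative only; the parameter names mirror the variables above, but this is not an LLVM API):

  #include <cstddef>

  // Sketch of the new profitability rule in improvesCriticalPathLen().
  bool shouldCombine(unsigned NewFirstDepth, unsigned NewRootLatency,
                     unsigned FirstDepth, unsigned RootLatency,
                     unsigned RootSlack, bool SlackIsAccurate,
                     std::size_t NumInserted, std::size_t NumDeleted) {
    unsigned NewCycleCount = NewFirstDepth + NewRootLatency;
    unsigned OldCycleCount =
        FirstDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
    if (NewCycleCount == OldCycleCount)
      return NumInserted < NumDeleted; // tie: only combine if code shrinks
    return NewCycleCount < OldCycleCount; // otherwise require a strict win
  }

This is why several madd fusions in the AArch64 tests below stop looking profitable: measured from the first instruction of each sequence, a standalone mul can issue before the accumulator operand the madd would have to wait for.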
Index: llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -1,8 +1,8 @@
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=EXYNOS %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=THUNDER2 %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=THUNDER3 %s
 #
 name: f1_2s
 registers:
@@ -26,9 +26,21 @@
 # UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
 # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_2s
+# THUNDER2: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# THUNDER2-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_2s
+# THUNDER3: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# THUNDER3-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_2s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
 # PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_2s
+# EXYNOS: %3:fpr64 = FMULv2f32 %0, %1
+# EXYNOS-NEXT: FSUBv2f32 killed %3, %2
 ---
 name: f1_4s
 registers:
@@ -52,9 +64,21 @@
 # UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
 # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_4s
+# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# THUNDER2-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_4s
+# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# THUNDER3-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_4s
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
 # PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_4s
+# EXYNOS: %3:fpr128 = FMULv4f32 %0, %1
+# EXYNOS-NEXT: FSUBv4f32 killed %3, %2
 ---
 name: f1_2d
 registers:
@@ -78,9 +102,21 @@
 # UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1
 # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
 #
+# THUNDER2-LABEL: name: f1_2d
+# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# THUNDER2-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
+# THUNDER3-LABEL: name: f1_2d
+# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# THUNDER3-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
 # PROFITABLE-LABEL: name: f1_2d
 # PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
 # PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
+#
+# EXYNOS-LABEL: name: f1_2d
+# EXYNOS: %3:fpr128 = FMULv2f64 %0, %1
+# EXYNOS-NEXT: FSUBv2f64 killed %3, %2
 ---
 name: f1_both_fmul_2s
 registers:
@@ -104,9 +140,27 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_2s
-# ALL: %4:fpr64 = FMULv2f32 %0, %1
-# ALL-NEXT: FMLSv2f32 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_2s
+# UNPROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
+# UNPROFITABLE-NEXT: %6:fpr64 = FMLSv2f32 killed %4, %2, %3
+#
+# THUNDER2-LABEL: name: f1_both_fmul_2s
+# THUNDER2: %4:fpr64 = FMULv2f32 %0, %1
+# THUNDER2-NEXT: %6:fpr64 = FMLSv2f32 killed %4, %2, %3
+#
+# THUNDER3-LABEL: name: f1_both_fmul_2s
+# THUNDER3: %4:fpr64 = FMULv2f32 %0, %1
+# THUNDER3-NEXT: %6:fpr64 = FMLSv2f32 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_2s
+# PROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
+# PROFITABLE-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# PROFITABLE-NEXT: FSUBv2f32 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_2s
+# EXYNOS: %4:fpr64 = FMULv2f32 %0, %1
+# EXYNOS-NEXT: %5:fpr64 = FMULv2f32 %2, %3
+# EXYNOS-NEXT: FSUBv2f32 killed %4, %5
 ---
 name: f1_both_fmul_4s
 registers:
@@ -130,9 +184,27 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_4s
-# ALL: %4:fpr128 = FMULv4f32 %0, %1
-# ALL-NEXT: FMLSv4f32 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_4s
+# UNPROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
+# UNPROFITABLE-NEXT: %6:fpr128 = FMLSv4f32 killed %4, %2, %3
+#
+# THUNDER2-LABEL: name: f1_both_fmul_4s
+# THUNDER2: %4:fpr128 = FMULv4f32 %0, %1
+# THUNDER2-NEXT: %6:fpr128 = FMLSv4f32 killed %4, %2, %3
+#
+# THUNDER3-LABEL: name: f1_both_fmul_4s
+# THUNDER3: %4:fpr128 = FMULv4f32 %0, %1
+# THUNDER3-NEXT: %6:fpr128 = FMLSv4f32 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_4s
+# PROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
+# PROFITABLE-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# PROFITABLE-NEXT: FSUBv4f32 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_4s
+# EXYNOS: %4:fpr128 = FMULv4f32 %0, %1
+# EXYNOS-NEXT: %5:fpr128 = FMULv4f32 %2, %3
+# EXYNOS-NEXT: FSUBv4f32 killed %4, %5
 ---
 name: f1_both_fmul_2d
 registers:
@@ -156,7 +228,25 @@
     RET_ReallyLR implicit $q0
 
 ...
-# ALL-LABEL: name: f1_both_fmul_2d
-# ALL: %4:fpr128 = FMULv2f64 %0, %1
-# ALL-NEXT: FMLSv2f64 killed %4, %2, %3
+# UNPROFITABLE-LABEL: name: f1_both_fmul_2d
+# UNPROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
+# UNPROFITABLE-NEXT: %6:fpr128 = FMLSv2f64 killed %4, %2, %3
+#
+# THUNDER2-LABEL: name: f1_both_fmul_2d
+# THUNDER2: %4:fpr128 = FMULv2f64 %0, %1
+# THUNDER2-NEXT: %6:fpr128 = FMLSv2f64 killed %4, %2, %3
+#
+# THUNDER3-LABEL: name: f1_both_fmul_2d
+# THUNDER3: %4:fpr128 = FMULv2f64 %0, %1
+# THUNDER3-NEXT: %6:fpr128 = FMLSv2f64 killed %4, %2, %3
+#
+# PROFITABLE-LABEL: name: f1_both_fmul_2d
+# PROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
+# PROFITABLE-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# PROFITABLE-NEXT: FSUBv2f64 killed %4, %5
+#
+# EXYNOS-LABEL: name: f1_both_fmul_2d
+# EXYNOS: %4:fpr128 = FMULv2f64 %0, %1
+# EXYNOS-NEXT: %5:fpr128 = FMULv2f64 %2, %3
+# EXYNOS-NEXT: FSUBv2f64 killed %4, %5
Index: llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -3,7 +3,8 @@
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 entry:
   %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
   %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
Index: llvm/test/CodeGen/AArch64/i128-math.ll
===================================================================
--- llvm/test/CodeGen/AArch64/i128-math.ll
+++ llvm/test/CodeGen/AArch64/i128-math.ll
@@ -259,9 +259,10 @@
 ; CHECK-LABEL: u128_mul:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: umulh x8, x0, x2
+; CHECK-NEXT: mul x9, x1, x2
 ; CHECK-NEXT: madd x8, x0, x3, x8
 ; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: madd x1, x1, x2, x8
+; CHECK-NEXT: add x1, x8, x9
 ; CHECK-NEXT: ret
   %1 = mul i128 %x, %y
   ret i128 %1
@@ -357,9 +358,10 @@
 ; CHECK-LABEL: i128_mul:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: umulh x8, x0, x2
+; CHECK-NEXT: mul x9, x1, x2
 ; CHECK-NEXT: madd x8, x0, x3, x8
 ; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: madd x1, x1, x2, x8
+; CHECK-NEXT: add x1, x8, x9
 ; CHECK-NEXT: ret
   %1 = mul i128 %x, %y
   ret i128 %1
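The i128 change above and the madd tests that follow share the same arithmetic: the old code serialized the high half through a chain of madds, while the new code lets the independent mul start in parallel and joins the chains with a single add. Back-of-the-envelope numbers (all latencies invented for illustration; the real values come from the per-CPU scheduling model):

  #include <algorithm>
  #include <cstdio>

  int main() {
    const int Mul = 3, Madd = 3, Umulh = 3, Add = 1;
    // Old u128_mul high half: umulh -> madd -> madd, one serial chain.
    int Old = Umulh + Madd + Madd; // 9
    // New: mul x9 runs in parallel with the umulh/madd chain; add x1 joins.
    int New = std::max(Umulh + Madd, Mul) + Add; // 7
    std::printf("old=%d new=%d\n", Old, New);
    return 0;
  }

The same reasoning, with a load feeding the accumulator instead of umulh, is what the comment in machine-combiner-madd.ll below describes.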
Index: llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
+++ llvm/test/CodeGen/AArch64/machine-combiner-madd.ll
@@ -10,13 +10,12 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx3t110 < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=tsv110 < %s | FileCheck %s
 
-; Make sure that inst-combine fuses the multiply add in the addressing mode of
-; the load.
+; Make sure that the machine combiner does not fuse the multiply-add: the
+; latency of max(mul, load) + add is shorter than that of load + madd.
 
 ; CHECK-LABEL: fun:
-; CHECK-NOT: mul
-; CHECK: madd
-; CHECK-NOT: mul
+; CHECK: mul
+; CHECK-NOT: madd
 
 %class.D = type { %class.basic_string.base, [4 x i8] }
 %class.basic_string.base = type <{ i64, i64, i32 }>
Index: llvm/test/CodeGen/AArch64/madd-combiner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -6,8 +6,8 @@
 define i32 @mul_add_imm(i32 %a, i32 %b) {
 ; CHECK-LABEL: mul_add_imm:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: orr w8, wzr, #0x4
-; CHECK-NEXT: madd w0, w0, w1, w8
+; CHECK-NEXT: mul w8, w0, w1
+; CHECK-NEXT: add w0, w8, #4
 ; CHECK-NEXT: ret
   %1 = mul i32 %a, %b
   %2 = add i32 %1, 4
Index: llvm/test/CodeGen/AArch64/madd-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-lohi.ll
+++ llvm/test/CodeGen/AArch64/madd-lohi.ll
@@ -6,17 +6,19 @@
 ; CHECK-LABEL: test_128bitmul:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: umulh x8, x0, x2
+; CHECK-NEXT: mul x9, x1, x2
 ; CHECK-NEXT: madd x8, x0, x3, x8
 ; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: madd x1, x1, x2, x8
+; CHECK-NEXT: add x1, x8, x9
 ; CHECK-NEXT: ret
 ;
 ; CHECK-BE-LABEL: test_128bitmul:
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: umulh x8, x1, x3
+; CHECK-BE-NEXT: mul x9, x0, x3
 ; CHECK-BE-NEXT: madd x8, x1, x2, x8
 ; CHECK-BE-NEXT: mul x1, x1, x3
-; CHECK-BE-NEXT: madd x0, x0, x3, x8
+; CHECK-BE-NEXT: add x0, x8, x9
 ; CHECK-BE-NEXT: ret
Index: llvm/test/CodeGen/AArch64/mul-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mul-lohi.ll
+++ llvm/test/CodeGen/AArch64/mul-lohi.ll
@@ -3,16 +3,20 @@
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
+; CHECK: mul [[TEMP0:x[0-9]+]], x0, x3
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
-; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
+; CHECK: mul [[TEMP2:x[0-9]+]], x1, x2
+; CHECK-DAG: add x1, [[TEMP1]], [[TEMP2]]
 ; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
 ; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE: mul [[TEMP0:x[0-9]+]], x1, x2
 ; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
-; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
-; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
+; CHECK-BE: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
+; CHECK-BE: mul [[TEMP2:x[0-9]+]], x0, x3
+; CHECK-BE-DAG: add x0, [[TEMP1]], [[TEMP2]]
 ; CHECK-BE-DAG: mul x1, x1, x3
 ; CHECK-BE-NEXT: ret
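The vector version of the same trade-off shows up in the srem-by-constant expansion that follows: the old `mla v1, v0, v2` had to wait for the `uzp2` result before the multiply-accumulate could complete, while the new standalone `mul v2, v0, v3` depends only on v0 and a loaded constant, so it overlaps the smull/uzp2 chain and a single `add` joins the two (the extra `ldr q3` is the same constant load, hoisted so the mul can start early). A chain-length sketch with made-up latencies (illustrative only; real numbers are per-CPU):

  #include <algorithm>
  #include <cstdio>

  int main() {
    const int Smull = 3, Uzp2 = 3, Mul = 3, Mla = 3, Add = 2;
    int WithMla = Smull + Uzp2 + Mla;                   // serial: 9
    int WithMulAdd = std::max(Smull + Uzp2, Mul) + Add; // overlapped: 8
    std::printf("mla=%d mul+add=%d\n", WithMla, WithMulAdd);
    return 0;
  }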
Index: llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
===================================================================
--- llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -354,14 +354,15 @@
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT: adrp x8, .LCPI13_1
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT: adrp x8, .LCPI13_2
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_2]
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
 ; CHECK-NEXT: adrp x8, .LCPI13_3
-; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
@@ -383,14 +384,15 @@
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
 ; CHECK-NEXT: adrp x8, .LCPI14_1
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT: adrp x8, .LCPI14_2
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2]
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
 ; CHECK-NEXT: adrp x8, .LCPI14_3
-; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
@@ -412,14 +414,15 @@
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
 ; CHECK-NEXT: adrp x8, .LCPI15_1
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT: adrp x8, .LCPI15_2
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_2]
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
 ; CHECK-NEXT: adrp x8, .LCPI15_3
-; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]
 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
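All of the X86 rewrites in the file that follows rely on the reassociation identity op(op(a, b), c) == op(a, op(b, c)) holding for the integer op in question (and/or/xor and the min/max family are all associative), so `x2 op x3` can be computed off the critical path while the leading `add` is still in flight. A quick exhaustive check for the byte umax case (standalone C++, purely illustrative):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    for (int a = 0; a < 256; ++a)
      for (int b = 0; b < 256; ++b)
        for (int c = 0; c < 256; ++c) {
          uint8_t x = (uint8_t)a, y = (uint8_t)b, z = (uint8_t)c;
          assert(std::max(std::max(x, y), z) == std::max(x, std::max(y, z)));
        }
    return 0;
  }

The new schedules below all have the same shape: the two independent operands are combined first, then folded into the add result.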
Index: llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
===================================================================
--- llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -9,15 +9,15 @@
 ; SSE-LABEL: reassociate_and_v4i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_and_v4i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpand %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_and_v4i32:
@@ -36,15 +36,15 @@
 ; SSE-LABEL: reassociate_or_v4i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: por %xmm3, %xmm2
 ; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_or_v4i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_or_v4i32:
@@ -63,15 +63,15 @@
 ; SSE-LABEL: reassociate_xor_v4i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm2
 ; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_xor_v4i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_xor_v4i32:
@@ -92,18 +92,18 @@
 ; SSE-LABEL: reassociate_and_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm4
 ; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pand %xmm6, %xmm0
 ; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm5
 ; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pand %xmm7, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_and_v8i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_and_v8i32:
@@ -122,18 +122,18 @@
 ; SSE-LABEL: reassociate_or_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: por %xmm6, %xmm4
 ; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
 ; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: por %xmm7, %xmm5
 ; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: por %xmm7, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_or_v8i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_or_v8i32:
@@ -152,18 +152,18 @@
 ; SSE-LABEL: reassociate_xor_v8i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm6, %xmm4
 ; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm6, %xmm0
 ; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm7, %xmm5
 ; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm7, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX2-LABEL: reassociate_xor_v8i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_xor_v8i32:
@@ -201,11 +201,11 @@
 ; AVX2-LABEL: reassociate_and_v16i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpand %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpand %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpand %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_and_v16i32:
@@ -240,11 +240,11 @@
 ; AVX2-LABEL: reassociate_or_v16i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_or_v16i32:
@@ -279,11 +279,11 @@
 ; AVX2-LABEL: reassociate_xor_v16i32:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpxor %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_xor_v16i32:
@@ -304,15 +304,15 @@
 ; SSE-LABEL: reassociate_umax_v16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pmaxub %xmm3, %xmm2
 ; SSE-NEXT: pmaxub %xmm2, %xmm0
-; SSE-NEXT: pmaxub %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_umax_v16i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxub %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxub %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxub %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i8> %x0, %x1
@@ -336,8 +336,8 @@
 ; AVX-LABEL: reassociate_umax_v8i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxuw %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxuw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxuw %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i16> %x0, %x1
@@ -374,8 +374,8 @@
 ; AVX-LABEL: reassociate_umax_v4i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxud %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxud %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxud %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <4 x i32> %x0, %x1
@@ -439,8 +439,8 @@
 ; AVX512-LABEL: reassociate_umax_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxuq %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpmaxuq %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpmaxuq %xmm3, %xmm2, %xmm1
+; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <2 x i64> %x0, %x1
@@ -470,8 +470,8 @@
 ; AVX-LABEL: reassociate_smax_v16i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxsb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxsb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i8> %x0, %x1
@@ -486,15 +486,15 @@
 ; SSE-LABEL: reassociate_smax_v8i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmaxsw %xmm3, %xmm2
 ; SSE-NEXT: pmaxsw %xmm2, %xmm0
-; SSE-NEXT: pmaxsw %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_smax_v8i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsw %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxsw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxsw %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i16> %x0, %x1
@@ -524,8 +524,8 @@
 ; AVX-LABEL: reassociate_smax_v4i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmaxsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmaxsd %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpmaxsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <4 x i32> %x0, %x1
@@ -584,8 +584,8 @@
 ; AVX512-LABEL: reassociate_smax_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsq %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpmaxsq %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpmaxsq %xmm3, %xmm2, %xmm1
+; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <2 x i64> %x0, %x1
@@ -600,15 +600,15 @@
 ; SSE-LABEL: reassociate_umin_v16i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: pminub %xmm3, %xmm2
 ; SSE-NEXT: pminub %xmm2, %xmm0
-; SSE-NEXT: pminub %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_umin_v16i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminub %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminub %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminub %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i8> %x0, %x1
@@ -635,8 +635,8 @@
 ; AVX-LABEL: reassociate_umin_v8i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminuw %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminuw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminuw %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i16> %x0, %x1
@@ -672,8 +672,8 @@
 ; AVX-LABEL: reassociate_umin_v4i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminud %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminud %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminud %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <4 x i32> %x0, %x1
@@ -737,8 +737,8 @@
 ; AVX512-LABEL: reassociate_umin_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpminuq %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpminuq %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpminuq %xmm3, %xmm2, %xmm1
+; AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <2 x i64> %x0, %x1
@@ -768,8 +768,8 @@
 ; AVX-LABEL: reassociate_smin_v16i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminsb %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminsb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminsb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i8> %x0, %x1
@@ -784,15 +784,15 @@
 ; SSE-LABEL: reassociate_smin_v8i16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pminsw %xmm3, %xmm2
 ; SSE-NEXT: pminsw %xmm2, %xmm0
-; SSE-NEXT: pminsw %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_smin_v8i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminsw %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminsw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminsw %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i16> %x0, %x1
@@ -822,8 +822,8 @@
 ; AVX-LABEL: reassociate_smin_v4i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpminsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpminsd %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpminsd %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 
 %t0 = add <4 x i32> %x0, %x1
@@ -882,8 +882,8 @@
 ; AVX512-LABEL: reassociate_smin_v2i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpminsq %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpminsq %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vpminsq %xmm3, %xmm2, %xmm1
+; AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <2 x i64> %x0, %x1
@@ -901,17 +901,17 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddb %xmm2, %xmm0
 ; SSE-NEXT: paddb %xmm3, %xmm1
-; SSE-NEXT: pmaxub %xmm5, %xmm1
+; SSE-NEXT: pmaxub %xmm6, %xmm4
 ; SSE-NEXT: pmaxub %xmm4, %xmm0
-; SSE-NEXT: pmaxub %xmm6, %xmm0
-; SSE-NEXT: pmaxub %xmm7, %xmm1
+; SSE-NEXT: pmaxub %xmm7, %xmm5
+; SSE-NEXT: pmaxub %xmm5, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_umax_v32i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxub %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxub %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxub %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <32 x i8> %x0, %x1
@@ -940,8 +940,8 @@
 ; AVX-LABEL: reassociate_umax_v16i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxuw %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxuw %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i16> %x0, %x1
@@ -995,8 +995,8 @@
 ; AVX-LABEL: reassociate_umax_v8i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxud %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxud %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxud %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i32> %x0, %x1
@@ -1091,8 +1091,8 @@
 ; AVX512-LABEL: reassociate_umax_v4i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmaxuq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpmaxuq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpmaxuq %ymm3, %ymm2, %ymm1
+; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 
 %t0 = add <4 x i64> %x0, %x1
@@ -1133,8 +1133,8 @@
 ; AVX-LABEL: reassociate_smax_v32i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxsb %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxsb %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <32 x i8> %x0, %x1
@@ -1150,17 +1150,17 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddw %xmm2, %xmm0
 ; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: pmaxsw %xmm5, %xmm1
+; SSE-NEXT: pmaxsw %xmm6, %xmm4
 ; SSE-NEXT: pmaxsw %xmm4, %xmm0
-; SSE-NEXT: pmaxsw %xmm6, %xmm0
-; SSE-NEXT: pmaxsw %xmm7, %xmm1
+; SSE-NEXT: pmaxsw %xmm7, %xmm5
+; SSE-NEXT: pmaxsw %xmm5, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_smax_v16i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxsw %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxsw %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i16> %x0, %x1
@@ -1201,8 +1201,8 @@
 ; AVX-LABEL: reassociate_smax_v8i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpmaxsd %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpmaxsd %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpmaxsd %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i32> %x0, %x1
@@ -1292,8 +1292,8 @@
 ; AVX512-LABEL: reassociate_smax_v4i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmaxsq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpmaxsq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpmaxsq %ymm3, %ymm2, %ymm1
+; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 
 %t0 = add <4 x i64> %x0, %x1
@@ -1309,17 +1309,17 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddb %xmm2, %xmm0
 ; SSE-NEXT: paddb %xmm3, %xmm1
-; SSE-NEXT: pminub %xmm5, %xmm1
+; SSE-NEXT: pminub %xmm6, %xmm4
 ; SSE-NEXT: pminub %xmm4, %xmm0
-; SSE-NEXT: pminub %xmm6, %xmm0
-; SSE-NEXT: pminub %xmm7, %xmm1
+; SSE-NEXT: pminub %xmm7, %xmm5
+; SSE-NEXT: pminub %xmm5, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_umin_v32i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminub %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminub %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminub %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminub %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <32 x i8> %x0, %x1
@@ -1354,8 +1354,8 @@
 ; AVX-LABEL: reassociate_umin_v16i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminuw %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminuw %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminuw %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminuw %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i16> %x0, %x1
@@ -1408,8 +1408,8 @@
 ; AVX-LABEL: reassociate_umin_v8i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminud %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminud %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminud %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminud %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i32> %x0, %x1
@@ -1504,8 +1504,8 @@
 ; AVX512-LABEL: reassociate_umin_v4i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpminuq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpminuq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpminuq %ymm3, %ymm2, %ymm1
+; AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 
 %t0 = add <4 x i64> %x0, %x1
@@ -1546,8 +1546,8 @@
 ; AVX-LABEL: reassociate_smin_v32i8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminsb %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminsb %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminsb %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminsb %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <32 x i8> %x0, %x1
@@ -1563,17 +1563,17 @@
 ; SSE: # %bb.0:
 ; SSE-NEXT: paddw %xmm2, %xmm0
 ; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: pminsw %xmm5, %xmm1
+; SSE-NEXT: pminsw %xmm6, %xmm4
 ; SSE-NEXT: pminsw %xmm4, %xmm0
-; SSE-NEXT: pminsw %xmm6, %xmm0
-; SSE-NEXT: pminsw %xmm7, %xmm1
+; SSE-NEXT: pminsw %xmm7, %xmm5
+; SSE-NEXT: pminsw %xmm5, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: reassociate_smin_v16i16:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminsw %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminsw %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminsw %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminsw %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <16 x i16> %x0, %x1
@@ -1614,8 +1614,8 @@
 ; AVX-LABEL: reassociate_smin_v8i32:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpminsd %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vpminsd %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpminsd %ymm3, %ymm2, %ymm1
+; AVX-NEXT: vpminsd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
 
 %t0 = add <8 x i32> %x0, %x1
@@ -1705,8 +1705,8 @@
 ; AVX512-LABEL: reassociate_smin_v4i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpminsq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vpminsq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpminsq %ymm3, %ymm2, %ymm1
+; AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: retq
 
 %t0 = add <4 x i64> %x0, %x1
@@ -1740,17 +1740,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxub %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxub %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxub %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxub %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxub %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxub %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxub %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umax_v64i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxub %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxub %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxub %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <64 x i8> %x0, %x1
@@ -1798,17 +1798,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxuw %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxuw %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxuw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxuw %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxuw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umax_v32i16:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxuw %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxuw %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxuw %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <32 x i16> %x0, %x1
@@ -1907,17 +1907,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxud %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxud %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxud %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxud %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxud %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxud %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umax_v16i32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxud %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxud %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxud %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <16 x i32> %x0, %x1
@@ -2091,8 +2091,8 @@
 ; AVX512-LABEL: reassociate_umax_v8i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxuq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxuq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxuq %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <8 x i64> %x0, %x1
@@ -2164,17 +2164,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxsb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxsb %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxsb %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsb %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxsb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smax_v64i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxsb %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxsb %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxsb %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <64 x i8> %x0, %x1
@@ -2206,17 +2206,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxsw %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxsw %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxsw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsw %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smax_v32i16:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxsw %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxsw %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxsw %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <32 x i16> %x0, %x1
@@ -2288,17 +2288,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpmaxsd %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpmaxsd %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpmaxsd %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaxsd %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smax_v16i32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxsd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxsd %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxsd %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <16 x i32> %x0, %x1
@@ -2463,8 +2463,8 @@
 ; AVX512-LABEL: reassociate_smax_v8i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxsq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmaxsq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmaxsq %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <8 x i64> %x0, %x1
@@ -2496,17 +2496,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminub %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminub %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminub %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminub %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminub %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminub %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminub %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umin_v64i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminub %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminub %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminub %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <64 x i8> %x0, %x1
@@ -2566,17 +2566,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminuw %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminuw %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminuw %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminuw %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminuw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminuw %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umin_v32i16:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminuw %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminuw %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminuw %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <32 x i16> %x0, %x1
@@ -2672,17 +2672,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminud %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminud %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminud %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminud %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminud %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminud %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_umin_v16i32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminud %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminud %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminud %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <16 x i32> %x0, %x1
@@ -2856,8 +2856,8 @@
 ; AVX512-LABEL: reassociate_umin_v8i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminuq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminuq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminuq %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <8 x i64> %x0, %x1
@@ -2929,17 +2929,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsb %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminsb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminsb %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminsb %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminsb %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminsb %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminsb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smin_v64i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminsb %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminsb %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminsb %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <64 x i8> %x0, %x1
@@ -2971,17 +2971,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsw %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminsw %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminsw %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminsw %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminsw %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminsw %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminsw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smin_v32i16:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminsw %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminsw %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminsw %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <32 x i16> %x0, %x1
@@ -3053,17 +3053,17 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsd %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpminsd %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpminsd %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vpminsd %ymm1, %ymm7, %ymm1
+; AVX2-NEXT: vpminsd %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpminsd %ymm7, %ymm5, %ymm2
+; AVX2-NEXT: vpminsd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: reassociate_smin_v16i32:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminsd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminsd %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminsd %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <16 x i32> %x0, %x1
@@ -3228,8 +3228,8 @@
 ; AVX512-LABEL: reassociate_smin_v8i64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpminsq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpminsq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpminsq %zmm3, %zmm2, %zmm1
+; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: retq
 
 %t0 = add <8 x i64> %x0, %x1
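The scalar GPR variants of the same reassociations follow. A hand-written C++ equivalent of one of them (hypothetical source; the .ll tests are written directly in IR) makes the rewrite easy to read:

  // Corresponds to reassociate_ands_i32 below. The combiner effectively turns
  // ((x0 - x1) & x2) & x3 into (x2 & x3) & (x0 - x1), so the sub result can
  // stay in %edi while %eax accumulates the independent and-chain.
  unsigned reassociate_ands_i32(unsigned x0, unsigned x1, unsigned x2,
                                unsigned x3) {
    return ((x0 - x1) & x2) & x3;
  }

Note in the CHECK lines below how `subl %esi, %edi` and `andl %ecx, %eax` can now issue independently, where the old sequence serialized everything through %eax/%rax.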
Index: llvm/test/CodeGen/X86/machine-combiner-int.ll
===================================================================
--- llvm/test/CodeGen/X86/machine-combiner-int.ll
+++ llvm/test/CodeGen/X86/machine-combiner-int.ll
@@ -13,8 +13,8 @@
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: leal (%rdi,%rsi), %eax
+; CHECK-NEXT: imull %ecx, %edx
 ; CHECK-NEXT: imull %edx, %eax
-; CHECK-NEXT: imull %ecx, %eax
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: retq
 %t0 = add i16 %x0, %x1
@@ -29,8 +29,8 @@
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: leal (%rdi,%rsi), %eax
+; CHECK-NEXT: imull %ecx, %edx
 ; CHECK-NEXT: imull %edx, %eax
-; CHECK-NEXT: imull %ecx, %eax
 ; CHECK-NEXT: retq
 
 ; DEAD: ADD32rr
@@ -47,8 +47,8 @@
 ; CHECK-LABEL: reassociate_muls_i64:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: leaq (%rdi,%rsi), %rax
+; CHECK-NEXT: imulq %rcx, %rdx
 ; CHECK-NEXT: imulq %rdx, %rax
-; CHECK-NEXT: imulq %rcx, %rax
 ; CHECK-NEXT: retq
 %t0 = add i64 %x0, %x1
 %t1 = mul i64 %x2, %t0
@@ -62,10 +62,10 @@
 define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_ands_i8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subb %sil, %al
-; CHECK-NEXT: andb %dl, %al
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subb %sil, %dil
 ; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: andb %dil, %al
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: retq
 %t0 = sub i8 %x0, %x1
@@ -79,10 +79,10 @@
 define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_ands_i32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %esi, %eax
-; CHECK-NEXT: andl %edx, %eax
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subl %esi, %edi
 ; CHECK-NEXT: andl %ecx, %eax
+; CHECK-NEXT: andl %edi, %eax
 ; CHECK-NEXT: retq
 %t0 = sub i32 %x0, %x1
 %t1 = and i32 %x2, %t0
@@ -93,10 +93,10 @@
 define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_ands_i64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: andq %rdx, %rax
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: subq %rsi, %rdi
 ; CHECK-NEXT: andq %rcx, %rax
+; CHECK-NEXT: andq %rdi, %rax
 ; CHECK-NEXT: retq
 %t0 = sub i64 %x0, %x1
 %t1 = and i64 %x2, %t0
@@ -110,10 +110,10 @@
 define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_ors_i8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subb %sil, %al
-; CHECK-NEXT: orb %dl, %al
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subb %sil, %dil
 ; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: orb %dil, %al
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: retq
 %t0 = sub i8 %x0, %x1
@@ -127,10 +127,10 @@
 define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_ors_i32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %esi, %eax
-; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subl %esi, %edi
 ; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: orl %edi, %eax
 ; CHECK-NEXT: retq
 %t0 = sub i32 %x0, %x1
 %t1 = or i32 %x2, %t0
@@ -141,10 +141,10 @@
 define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_ors_i64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: orq %rdx, %rax
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: subq %rsi, %rdi
 ; CHECK-NEXT: orq %rcx, %rax
+; CHECK-NEXT: orq %rdi, %rax
 ; CHECK-NEXT: retq
 %t0 = sub i64 %x0, %x1
 %t1 = or i64 %x2, %t0
@@ -158,10 +158,10 @@
 define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_xors_i8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subb %sil, %al
-; CHECK-NEXT: xorb %dl, %al
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subb %sil, %dil
 ; CHECK-NEXT: xorb %cl, %al
+; CHECK-NEXT: xorb %dil, %al
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: retq
 %t0 = sub i8 %x0, %x1
@@ -175,10 +175,10 @@
 define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_xors_i32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %esi, %eax
-; CHECK-NEXT: xorl %edx, %eax
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: subl %esi, %edi
 ; CHECK-NEXT: xorl %ecx, %eax
+; CHECK-NEXT: xorl %edi, %eax
 ; CHECK-NEXT: retq
 %t0 = sub i32 %x0, %x1
 %t1 = xor i32 %x2, %t0
@@ -189,10 +189,10 @@
 define i64 @reassociate_xors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_xors_i64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: xorq %rdx, %rax
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: subq %rsi, %rdi
 ; CHECK-NEXT: xorq %rcx, %rax
+; CHECK-NEXT: xorq %rdi, %rax
 ; CHECK-NEXT: retq
 %t0 = sub i64 %x0, %x1
 %t1 = xor i64 %x2, %t0