Index: lib/Target/X86/X86FastISel.cpp
===================================================================
--- lib/Target/X86/X86FastISel.cpp
+++ lib/Target/X86/X86FastISel.cpp
@@ -3337,7 +3337,7 @@
   AM.getFullAddress(AddrOps);
 
   MachineInstr *Result =
-    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment, /*AllowCommute=*/ true);
   if (!Result)
     return false;
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -404,7 +404,7 @@
                                       MachineInstr* MI,
                                       unsigned OpNum,
                                       const SmallVectorImpl<MachineOperand> &MOs,
-                                      unsigned Size, unsigned Alignment) const;
+                                      unsigned Size, unsigned Alignment, bool AllowCommute) const;
 
   void getUnconditionalBranch(MCInst &Branch,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -3926,55 +3926,34 @@
   if (!DefMI->isSafeToMove(this, nullptr, SawStore))
     return nullptr;
 
-  // We try to commute MI if possible.
-  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
-  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
-    // Collect information about virtual register operands of MI.
-    unsigned SrcOperandId = 0;
-    bool FoundSrcOperand = false;
-    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (Reg != FoldAsLoadDefReg)
-        continue;
-      // Do not fold if we have a subreg use or a def or multiple uses.
-      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
-        return nullptr;
-
-      SrcOperandId = i;
-      FoundSrcOperand = true;
-    }
-    if (!FoundSrcOperand) return nullptr;
-
-    // Check whether we can fold the def into SrcOperandId.
-    SmallVector<unsigned, 2> Ops;
-    Ops.push_back(SrcOperandId);
-    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
-    if (FoldMI) {
-      FoldAsLoadDefReg = 0;
-      return FoldMI;
-    }
-
-    if (Idx == 1) {
-      // MI was changed but it didn't help, commute it back!
-      commuteInstruction(MI, false);
+  // Collect information about virtual register operands of MI.
+  unsigned SrcOperandId = 0;
+  bool FoundSrcOperand = false;
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg != FoldAsLoadDefReg)
+      continue;
+    // Do not fold if we have a subreg use or a def or multiple uses.
+    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
       return nullptr;
-    }
 
-    // Check whether we can commute MI and enable folding.
-    if (MI->isCommutable()) {
-      MachineInstr *NewMI = commuteInstruction(MI, false);
-      // Unable to commute.
-      if (!NewMI) return nullptr;
-      if (NewMI != MI) {
-        // New instruction. It doesn't need to be kept.
-        NewMI->eraseFromParent();
-        return nullptr;
-      }
-    }
+    SrcOperandId = i;
+    FoundSrcOperand = true;
   }
+  if (!FoundSrcOperand) return nullptr;
+
+  // Check whether we can fold the def into SrcOperandId.
+  SmallVector<unsigned, 2> Ops;
+  Ops.push_back(SrcOperandId);
+  MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+  if (FoldMI) {
+    FoldAsLoadDefReg = 0;
+    return FoldMI;
+  }
+
+  return nullptr;
 }
@@ -4134,7 +4113,7 @@
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                     unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align) const {
+                                    unsigned Size, unsigned Align, bool AllowCommute) const {
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4231,6 +4210,58 @@
     }
   }
 
+  // If the instruction and target operand are commutable, commute the instruction and try again.
+  if (AllowCommute) {
+    unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
+    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+      bool HasDef = MI->getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+      bool Tied0 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied1 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands is tied to the destination then we cannot commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied0) ||
+          (HasDef && Reg0 == Reg2 && Tied1))
+        return nullptr;
+
+      if ((CommuteOpIdx1 == OriginalOpIdx) || (CommuteOpIdx2 == OriginalOpIdx)) {
+        MachineInstr* CommutedMI = commuteInstruction(MI, false);
+        if (!CommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (CommutedMI != MI) {
+          // New instruction. We can't fold from this.
+          CommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Attempt to fold with the commuted version of the instruction.
+        unsigned CommuteOpIdx = (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
+        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx, MOs, Size, Align, /*AllowCommute=*/ false);
+        if (NewMI)
+          return NewMI;
+
+        // Folding failed again - undo the commute before returning.
+        MachineInstr* UncommutedMI = commuteInstruction(MI, false);
+        if (!UncommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (UncommutedMI != MI) {
+          // New instruction. It doesn't need to be kept.
+          UncommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Return here to prevent duplicate fuse failure report.
+        return nullptr;
+      }
+    }
+  }
+
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
@@ -4440,7 +4471,7 @@
 
   SmallVector<MachineOperand,4> MOs;
   MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment, /*AllowCommute=*/ true);
 }
 
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
@@ -4593,7 +4624,7 @@
       break;
     }
   }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment, /*AllowCommute=*/ true);
 }
Index: test/CodeGen/X86/2014-10-16-xxhash.ll
===================================================================
--- test/CodeGen/X86/2014-10-16-xxhash.ll
+++ test/CodeGen/X86/2014-10-16-xxhash.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=i386--netbsd -O3
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+  %0 = load i32* @a, align 4, !tbaa !1
+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+  %2 = load i32* %total_len, align 4, !tbaa !5
+  %tobool = icmp eq i32 %2, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+  %3 = load i64* %v3, align 4, !tbaa !8
+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+  %4 = load i64* %v4, align 4, !tbaa !9
+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+  %5 = load i64* %v2, align 4, !tbaa !10
+  %shl = shl i64 %5, 1
+  %or = or i64 %shl, %5
+  %shl2 = shl i64 %3, 2
+  %shr = lshr i64 %3, 1
+  %or3 = or i64 %shl2, %shr
+  %add = add i64 %or, %or3
+  %mul = mul i64 %4, -4417276706812531889
+  %shl4 = mul i64 %4, -8834553413625063778
+  %shr5 = ashr i64 %mul, 3
+  %or6 = or i64 %shr5, %shl4
+  %mul7 = mul nsw i64 %or6, 1400714785074694791
+  %xor = xor i64 %add, %mul7
+  store i64 %xor, i64* @b, align 8, !tbaa !11
+  %mul8 = mul nsw i64 %xor, 1400714785074694791
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %6 = load i64* @b, align 8, !tbaa !11
+  %xor10 = xor i64 %6, -4417276706812531889
+  %mul11 = mul nsw i64 %xor10, 400714785074694791
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+  %storemerge = add i64 %storemerge.in, -8796714831421723037
+  store i64 %storemerge, i64* @b, align 8, !tbaa !11
+  ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !2, i64 0}
+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}
+!7 = metadata !{metadata !"long long", metadata !3, i64 0}
+!8 = metadata !{metadata !6, metadata !7, i64 16}
+!9 = metadata !{metadata !6, metadata !7, i64 24}
+!10 = metadata !{metadata !6, metadata !7, i64 8}
+!11 = metadata !{metadata !7, metadata !7, i64 0}
Index: test/CodeGen/X86/avx1-stack-reload-folding.ll
===================================================================
--- test/CodeGen/X86/avx1-stack-reload-folding.ll
+++ test/CodeGen/X86/avx1-stack-reload-folding.ll
@@ -0,0 +1,16 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Function Attrs: nounwind readonly uwtable
+define <32 x double> @_Z14vstack_foldDv32_dS_(<32 x double> %a, <32 x double> %b) {
+  %1 = fadd <32 x double> %a, %b
+  %2 = fsub <32 x double> %a, %b
+  %3 = fmul <32 x double> %1, %2
+  ret <32 x double> %3
+
+  ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
+  ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  ;CHECK-NOT: vmovapd {{.*#+}} 32-byte Reload
+}
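
For readers skimming the patch: the new code in the X86InstrInfo.cpp hunks follows a commute-retry pattern. Below is a minimal standalone sketch of that pattern, assuming a toy Instr type and a stand-in fold table; the names are illustrative only and are not LLVM API (the real code operates on MachineInstr and the per-opcode folding tables).

// Sketch of the commute-retry pattern (illustrative, not LLVM code).
// AllowCommute is cleared on the recursive attempt, so a failed fold of
// the commuted form cannot commute again and recurse indefinitely.
#include <utility>

struct Instr {
  int Ops[2];      // stand-in for the two commutable operands
  bool Commutable; // stand-in for MI->isCommutable()
};

// Stand-in for the per-opcode fold tables: pretend only operand 1 of
// this hypothetical opcode accepts a memory operand.
static bool tableAllowsFoldAt(int OpIdx) { return OpIdx == 1; }

static bool foldMemoryOperand(Instr &I, int OpIdx, bool AllowCommute) {
  if (tableAllowsFoldAt(OpIdx))
    return true;                         // folded directly
  if (AllowCommute && I.Commutable) {
    std::swap(I.Ops[0], I.Ops[1]);       // commute the instruction
    int OtherIdx = (OpIdx == 0) ? 1 : 0; // operand index after the commute
    if (foldMemoryOperand(I, OtherIdx, /*AllowCommute=*/false))
      return true;                       // folded the commuted form
    std::swap(I.Ops[0], I.Ops[1]);       // fold failed: undo the commute
  }
  return false;                          // no fusion
}

The hunks above additionally refuse to commute when either commutable operand is tied to the destination register (the Tied0/Tied1 checks), and when folding fails they restore the original operand order before returning, just as the sketch undoes its swap.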