Index: llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp =================================================================== --- llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -68,12 +68,10 @@ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) for (const auto &LI : (*SI)->liveins()) { - for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - } + unsigned Reg = LI.PhysReg; + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } // Mark live-out callee-saved registers. In a return block this is @@ -86,12 +84,9 @@ unsigned Reg = *I; if (!IsReturnBlock && !Pristine.test(Reg)) continue; - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - } + Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); + KillIndices[Reg] = BBSize; + DefIndices[Reg] = ~0u; } } @@ -302,9 +297,6 @@ if (!Keep) KeepRegs.reset(SubregReg); } - // Conservatively mark super-registers as unusable. - for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) - Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1); } } for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { @@ -328,13 +320,9 @@ RegRefs.insert(std::make_pair(Reg, &MO)); // It wasn't previously live but now it is, this is a kill. - // Repeat for all aliases. - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - unsigned AliasReg = *AI; - if (KillIndices[AliasReg] == ~0u) { - KillIndices[AliasReg] = Count; - DefIndices[AliasReg] = ~0u; - } + if (KillIndices[Reg] == ~0u) { + KillIndices[Reg] = Count; + DefIndices[Reg] = ~0u; } } } @@ -413,26 +401,52 @@ // If any instructions that define AntiDepReg also define the NewReg, it's // not suitable. 
For example, Instruction with multiple definitions can // result in this condition. - if (isNewRegClobberedByRefs(RegRefBegin, RegRefEnd, NewReg)) continue; - // If NewReg is dead and NewReg's most recent def is not before - // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg. + bool NewRegAliasClobberedByRefs = false; + for (MCRegAliasIterator AI(NewReg, TRI, true); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; + if (isNewRegClobberedByRefs(RegRefBegin, RegRefEnd, AliasReg)) { + NewRegAliasClobberedByRefs = true; + break; + } + } + if (NewRegAliasClobberedByRefs) + continue; assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u)) && "Kill and Def maps aren't consistent for AntiDepReg!"); - assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u)) && "Kill and Def maps aren't consistent for NewReg!"); - if (KillIndices[NewReg] != ~0u || - Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) || - KillIndices[AntiDepReg] > DefIndices[NewReg]) - continue; - // If NewReg overlaps any of the forbidden registers, we can't use it. - bool Forbidden = false; - for (SmallVectorImpl<unsigned>::iterator it = Forbid.begin(), - ite = Forbid.end(); it != ite; ++it) - if (TRI->regsOverlap(NewReg, *it)) { - Forbidden = true; + bool Suitable = true; + for (MCRegAliasIterator AI(NewReg, TRI, true); AI.isValid(); ++AI) { + unsigned AliasReg = *AI; + // If every alias of NewReg is dead and most recent def of that alias + // is not before AntiDepReg's kill (so the alias and AntiDepReg aren't + // both live at any program point), it's safe to replace AntiDepReg + // with NewReg. 
+ assert( + ((KillIndices[AliasReg] == ~0u) != (DefIndices[AliasReg] == ~0u)) && + "Kill and Def maps aren't consistent for alias of NewReg!"); + if (KillIndices[AliasReg] != ~0u || // Alias Live + // Alias used implicitly + Classes[AliasReg] == reinterpret_cast<TargetRegisterClass *>(-1) || + // both Alias and AntiDepReg are live at a program point + KillIndices[AntiDepReg] > DefIndices[AliasReg]) { + Suitable = false; break; } - if (Forbidden) continue; + // If NewReg overlaps any of the forbidden registers, we can't use it. + bool Forbidden = false; + for (SmallVectorImpl<unsigned>::iterator FI = Forbid.begin(), + FIE = Forbid.end(); + FI != FIE; ++FI) + if (TRI->regsOverlap(AliasReg, *FI)) { + Forbidden = true; + break; + } + if (Forbidden) { + Suitable = false; + break; + } + } + if (!Suitable) + continue; return NewReg; } Index: llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll =================================================================== --- llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll +++ llvm/test/CodeGen/X86/critical-anti-dep-breaker.ll @@ -5,7 +5,7 @@ ; The critical-anti-dependency-breaker must not use register def information from a kill inst. ; This test case expects such an instruction to appear as a comment with def info for RDI. ; There is an anti-dependency (WAR) hazard using RAX using default reg allocation and scheduling. -; The post-RA-scheduler and critical-anti-dependency breaker can eliminate that hazard using R10. +; The post-RA-scheduler and critical-anti-dependency breaker can eliminate that hazard using RSI. ; That is the first free register that isn't used as a param in the call to "@Image". 
@PartClass = external global i32 @@ -23,12 +23,12 @@ ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq NullToken@{{.*}}(%rip), %rax -; CHECK-NEXT: movq PartClass@{{.*}}(%rip), %r10 +; CHECK-NEXT: movq PartClass@{{.*}}(%rip), %rsi ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movl (%rsi), %ebp ; CHECK-NEXT: movq (%rax), %rax -; CHECK-NEXT: movl (%r10), %ebp ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movl %ebp, %esi Index: llvm/test/CodeGen/X86/fma.ll =================================================================== --- llvm/test/CodeGen/X86/fma.ll +++ llvm/test/CodeGen/X86/fma.ll @@ -1390,31 +1390,31 @@ ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm2 = 
xmm2[0,1],mem[0],xmm2[3] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] -; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0x4c,0x24,0x58,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm2[0],mem[0],xmm2[2,3] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, 
%xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] ; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d] @@ -1501,12 +1501,12 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf0,0x12,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm1[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] @@ -1716,12 +1716,12 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; 
FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf0,0x12,0x44,0x24,0x68] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm1[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload @@ -1926,25 +1926,25 @@ ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] ; FMACALL32_BDVER2-NEXT: subl $352, %esp ## encoding: [0x81,0xec,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## imm = 0x160 -; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, 
%ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x55,0x38] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x40,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x54,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc2] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] @@ -1981,12 +1981,12 @@ ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm1 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x8c,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] @@ -2002,12 +2002,12 @@ ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte 
Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] -; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf1,0x14,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],mem[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 @@ -2041,12 +2041,12 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, 
%xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf0,0x12,0x84,0x24,0x48,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm1[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload Index: llvm/test/CodeGen/X86/recip-fastmath.ll =================================================================== --- llvm/test/CodeGen/X86/recip-fastmath.ll +++ llvm/test/CodeGen/X86/recip-fastmath.ll @@ -1293,13 +1293,13 @@ ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vrcpps %ymm1, %ymm5 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 +; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm5) - ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm1 * ymm2) - ymm3 -; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 +; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm5 * ymm4) + ymm5 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2 ; BDVER2-NEXT: retq @@ -1308,19 +1308,19 @@ ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vrcpps %ymm1, %ymm5 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm3 +; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: 
vmulps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; BTVER2-NEXT: vmulps %ymm3, %ymm5, %ymm3 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 -; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; BTVER2-NEXT: vaddps %ymm3, %ymm5, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 Index: llvm/test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- llvm/test/CodeGen/X86/recip-fastmath2.ll +++ llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -1361,12 +1361,12 @@ ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vrcpps %ymm1, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 +; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 -; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 -; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2 +; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4 ; BDVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 ; BDVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 ; BDVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 @@ -1376,16 +1376,16 @@ ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vrcpps %ymm1, %ymm4 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; BTVER2-NEXT: vmulps %ymm0, 
%ymm2, %ymm0 +; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 -; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 -; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 ; BTVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 @@ -1573,16 +1573,16 @@ ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5 -; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4 +; BDVER2-NEXT: vrcpps %ymm1, %ymm4 +; BDVER2-NEXT: vmulps %ymm6, %ymm2, %ymm5 +; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm4) - ymm3 +; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm6 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5 -; BDVER2-NEXT: vrcpps %ymm1, %ymm2 +; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm4 * ymm3) + ymm4 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3 -; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2 ; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4 @@ -1592,22 +1592,22 @@ ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm5 +; BTVER2-NEXT: vmulps %ymm6, %ymm2, %ymm5 ; BTVER2-NEXT: vmulps %ymm5, %ymm0, %ymm0 -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; BTVER2-NEXT: vsubps %ymm0, %ymm6, %ymm0 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 -; BTVER2-NEXT: vrcpps %ymm1, %ymm2 -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 ; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] +; BTVER2-NEXT: vrcpps %ymm1, %ymm5 +; BTVER2-NEXT: vmulps %ymm5, %ymm1, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 -; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 -; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 +; BTVER2-NEXT: vmulps %ymm3, %ymm5, %ymm3 +; BTVER2-NEXT: vaddps %ymm3, %ymm5, %ymm2 +; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 ; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 ; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1