Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -8479,6 +8479,19 @@
   return nullptr;
 }
 
+static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) {
+  if (MF.getFunction().optForSize() || !hasUndefRegUpdate(MI.getOpcode()) ||
+      !MI.getOperand(1).isReg())
+    return false;
+
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
+  if (VRegDef == nullptr)
+    return false;
+  return VRegDef->isImplicitDef();
+}
+
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -8497,10 +8510,10 @@
        MI.getOpcode() == X86::PUSH64r))
     return nullptr;
 
-  // Avoid partial register update stalls unless optimizing for size.
-  // TODO: we should block undef reg update as well.
+  // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().optForSize() &&
-      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+       shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
   unsigned NumOps = MI.getDesc().getNumOperands();
@@ -8674,11 +8687,10 @@
   if (NoFusing)
     return nullptr;
 
-  // Unless optimizing for size, don't fold to avoid partial
-  // register update stalls
-  // TODO: we should block undef reg update as well.
+  // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().optForSize() &&
-      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+       shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
   // Don't fold subreg spills, or reloads that use a high subreg.
@@ -8875,10 +8887,10 @@
   // Check switch flag
   if (NoFusing) return nullptr;
 
-  // Avoid partial register update stalls unless optimizing for size.
-  // TODO: we should block undef reg update as well.
+  // Avoid partial and undef register update stalls unless optimizing for size.
   if (!MF.getFunction().optForSize() &&
-      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+      (hasPartialRegUpdate(MI.getOpcode(), Subtarget) ||
+       shouldPreventUndefRegUpdateMemFold(MF, MI)))
     return nullptr;
 
   // Determine the alignment of the load.
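For context, the new helper blocks the memory fold only when the instruction is one that hasUndefRegUpdate() flags (scalar converts and square roots, as the updated tests show) and its pass-through source, operand 1, is defined solely by an IMPLICIT_DEF. A minimal source-level reduction of the pattern the conversion tests below exercise is sketched here; it is illustrative only, not part of the patch, and assumes AVX codegen through fast-isel, which is what the long_to_double_rm checks appear to cover. The quoted assembly is taken from the AVX check lines that change in that test.

    // Illustrative reduction (assumed to compile down the fast-isel path
    // with AVX enabled). Before this change the load folded into the convert:
    //   vcvtsi2sdq (%rdi), %xmm0, %xmm0
    // After it, the load stays a separate instruction:
    //   movq (%rdi), %rax
    //   vcvtsi2sdq %rax, %xmm0, %xmm0
    double long_to_double_rm(long *a) {
      return static_cast<double>(*a);
    }
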
Index: llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
+++ llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
@@ -27,7 +27,8 @@
 ;
 ; AVX-LABEL: long_to_double_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    movq (%rdi), %rax
+; AVX-NEXT:    vcvtsi2sdq %rax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, i64* %a
@@ -75,7 +76,8 @@
 ;
 ; AVX-LABEL: long_to_float_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    movq (%rdi), %rax
+; AVX-NEXT:    vcvtsi2ssq %rax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, i64* %a
Index: llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ llvm/trunk/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -43,7 +43,8 @@
 ; AVX_X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
-; AVX_X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX_X86-NEXT:    movl 8(%ebp), %eax
+; AVX_X86-NEXT:    vcvtsi2sdl %eax, %xmm0, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -64,7 +65,8 @@
 ;
 ; AVX-LABEL: int_to_double_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    movl (%rdi), %eax
+; AVX-NEXT:    vcvtsi2sdl %eax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_double_rm:
@@ -187,7 +189,8 @@
 ; AVX_X86:       # %bb.0: # %entry
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
-; AVX_X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX_X86-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
@@ -207,7 +210,8 @@
 ;
 ; AVX-LABEL: int_to_float_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    movl (%rdi), %eax
+; AVX-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_float_rm:
Index: llvm/trunk/test/CodeGen/X86/vector-sqrt.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sqrt.ll
+++ llvm/trunk/test/CodeGen/X86/vector-sqrt.ll
@@ -5,8 +5,10 @@
 define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 {
 ; CHECK-LABEL: sqrtd2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtsd 8(%rdi), %xmm1, %xmm1
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 entry:
@@ -27,10 +29,14 @@
 define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
 ; CHECK-LABEL: sqrtf4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtss 4(%rdi), %xmm1, %xmm1
-; CHECK-NEXT:    vsqrtss 8(%rdi), %xmm2, %xmm2
-; CHECK-NEXT:    vsqrtss 12(%rdi), %xmm3, %xmm3
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
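
The vector-sqrt.ll changes illustrate why blocking the fold helps when the pass-through operand is undef: the vmovsd/vmovss loads write the whole destination register, so the register-to-register vsqrtsd/vsqrtss that follows reads a value defined immediately before it, whereas the folded form (vsqrtsd (%rdi), %xmm0, %xmm0) kept a false dependence on whatever last wrote that xmm register. A source-level sketch of sqrtd2 follows; it is illustrative only, with an out-parameter standing in for the test's <2 x double> return value.

    #include <cmath>

    // Two independent scalar square roots, as in sqrtd2 above. With the fold
    // blocked, each input is loaded with vmovsd (a full-register write) and
    // the square root runs register-to-register, per the new CHECK lines.
    void sqrtd2(const double *v, double out[2]) {
      out[0] = std::sqrt(v[0]);
      out[1] = std::sqrt(v[1]);
    }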