Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -5295,21 +5295,57 @@
                                Size, Alignment, /*AllowCommute=*/true);
 }
 
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
-                                  const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+///   movss (%rdi), %xmm0
+///   addps %xmm0, %xmm0
+/// ->
+///   addps (%rdi), %xmm0
+///
+/// But this one is:
+///   movss (%rdi), %xmm0
+///   addss %xmm0, %xmm0
+/// ->
+///   addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+                                             const MachineInstr &UserMI,
+                                             const MachineFunction &MF) {
   unsigned Opc = LoadMI.getOpcode();
+  unsigned UserOpc = UserMI.getOpcode();
   unsigned RegSize =
       MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
 
-  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
     // These instructions only load 32 bits, we can't fold them if the
-    // destination register is wider than 32 bits (4 bytes).
-    return true;
+    // destination register is wider than 32 bits (4 bytes), and its user
+    // instruction isn't scalar (SS).
+    switch (UserOpc) {
+    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+    case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
 
-  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
     // These instructions only load 64 bits, we can't fold them if the
-    // destination register is wider than 64 bits (8 bytes).
-    return true;
+    // destination register is wider than 64 bits (8 bytes), and its user
+    // instruction isn't scalar (SD).
+    switch (UserOpc) {
+    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+    case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
 
   return false;
 }
@@ -5321,7 +5357,7 @@
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
   }
@@ -5434,7 +5470,7 @@
     break;
   }
   default: {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.
Index: llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
+++ llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated out of the simplest intrinsics usage:
+;  _mm_add_ss(a, _mm_load_ss(b));
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <4 x float> %va, i32 0
+  %b = load float, float* %pb
+  %r = fadd float %a, %b
+  %vr = insertelement <4 x float> %va, float %r, i32 0
+  ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <2 x double> %va, i32 0
+  %b = load double, double* %pb
+  %r = fadd double %a, %b
+  %vr = insertelement <2 x double> %va, double %r, i32 0
+  ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <4 x float> %va, i32 0
+  %b = load float, float* %pb
+  %r = fsub float %a, %b
+  %vr = insertelement <4 x float> %va, float %r, i32 0
+  ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <2 x double> %va, i32 0
+  %b = load double, double* %pb
+  %r = fsub double %a, %b
+  %vr = insertelement <2 x double> %va, double %r, i32 0
+  ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <4 x float> %va, i32 0
+  %b = load float, float* %pb
+  %r = fmul float %a, %b
+  %vr = insertelement <4 x float> %va, float %r, i32 0
+  ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <2 x double> %va, i32 0
+  %b = load double, double* %pb
+  %r = fmul double %a, %b
+  %vr = insertelement <2 x double> %va, double %r, i32 0
+  ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <4 x float> %va, i32 0
+  %b = load float, float* %pb
+  %r = fdiv float %a, %b
+  %vr = insertelement <4 x float> %va, float %r, i32 0
+  ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = extractelement <2 x double> %va, i32 0
+  %b = load double, double* %pb
+  %r = fdiv double %a, %b
+  %vr = insertelement <2 x double> %va, double %r, i32 0
+  ret <2 x double> %vr
+}
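
Not part of the patch: a minimal C++ intrinsics sketch of the two situations the new isNonFoldablePartialRegisterLoad() check distinguishes, assuming an SSE-capable x86-64 target and <xmmintrin.h>; the function names here are illustrative only.

#include <xmmintrin.h>

// Scalar use of the loaded value: addss only reads the low 32 bits of its
// memory operand, so the 4-byte load can be folded into the arithmetic
// instruction. This is the _mm_add_ss(a, _mm_load_ss(b)) pattern the new
// test file exercises.
__m128 scalar_add(__m128 a, const float *b) {
  return _mm_add_ss(a, _mm_load_ss(b));
}

// Packed use of the loaded value: folding the 4-byte load into addps would
// turn it into a 16-byte memory read, making addps use contents the original
// movss never loaded, so the load must stay a separate instruction.
__m128 packed_add(__m128 a, const float *b) {
  return _mm_add_ps(a, _mm_load_ss(b));
}

With the patch applied, the first function should lower to a single addss (%rdi), %xmm0 (vaddss under AVX), as the new test verifies; the second should keep the separate movss, which is exactly the non-scalar-user case the opcode switch rejects.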