Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -26174,52 +26174,6 @@
   return SDValue();
 }
 
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), dl,
-                      Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  SDLoc dl(N);
-  MVT VT = N->getOperand(1)->getSimpleValueType(0);
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
-    // Extract the countS bits from the immediate so we can get the proper
-    // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   SDValue V0 = N->getOperand(0);
   SDValue V1 = N->getOperand(1);
@@ -26685,11 +26639,6 @@
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
   case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
   }
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.h
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.h
@@ -512,6 +512,14 @@
                                       MachineBasicBlock::iterator &MBBI,
                                       LiveVariables *LV) const;
 
+  /// Handles memory folding for special-case instructions, for instance those
+  /// requiring custom manipulation of the address.
+  MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI,
+                                        unsigned OpNum,
+                                        ArrayRef<MachineOperand> MOs,
+                                        MachineBasicBlock::iterator InsertPt,
+                                        unsigned Size, unsigned Align) const;
+
   /// isFrameOperand - Return true and the FrameIndex if the specified
   /// operand and follow operands form a reference to the stack frame.
   bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
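For reference while reading both the removed DAG combine above and the new foldMemoryOperandCustom hook declared here: the 8-bit insertps immediate packs three fields - CountS in bits 7:6 (which element of the source operand is taken), CountD in bits 5:4 (which destination lane is written), and ZMask in bits 3:0 (result lanes forced to zero). The old combine only extracted CountS (the `>> 6`); the new hook decodes all three fields. A minimal standalone sketch of that layout follows; it is illustrative only, not part of the patch, and the names are made up for the example.

    #include <cassert>
    #include <cstdint>

    // Illustrative decode of the SSE4.1 insertps control byte.
    struct InsertPSImm {
      unsigned SrcIdx; // CountS: element selected from the source operand.
      unsigned DstIdx; // CountD: destination lane that is written.
      unsigned ZMask;  // Result lanes that are forced to zero.
    };

    static InsertPSImm decodeInsertPSImm(uint8_t Imm) {
      return {(Imm >> 6) & 3u, (Imm >> 4) & 3u, Imm & 15u};
    }

    int main() {
      // $48 (0x30): source element 0 written into destination lane 3, i.e. the
      // "xmm0 = xmm0[0,1,2],mem[0]" pattern seen in the memory-form tests below.
      InsertPSImm I = decodeInsertPSImm(48);
      assert(I.SrcIdx == 0 && I.DstIdx == 3 && I.ZMask == 0);
      return 0;
    }
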
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -4876,12 +4876,35 @@
   return false;
 }
 
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
   unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4) // FrameIndex only
-    addOffset(MIB, 0);
+
+  if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether it's zero or not).
+    for (unsigned i = 0; i != NumAddrOps; ++i)
+      MIB.addOperand(MOs[i]);
+    addOffset(MIB, PtrOffset);
+  } else {
+    // General Memory Addressing - we need to add any offset to an existing
+    // offset.
+    assert(MOs.size() == 5 && "Unexpected memory operand list length");
+    for (unsigned i = 0; i != NumAddrOps; ++i) {
+      const MachineOperand &MO = MOs[i];
+      if (i == 3 && PtrOffset != 0) {
+        assert((MO.isImm() || MO.isGlobal()) &&
+               "Unexpected memory operand type");
+        if (MO.isImm()) {
+          MIB.addImm(MO.getImm() + PtrOffset);
+        } else {
+          MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset() + PtrOffset,
+                               MO.getTargetFlags());
+        }
+      } else {
+        MIB.addOperand(MO);
+      }
+    }
+  }
 }
 
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
@@ -4916,7 +4939,8 @@
 static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
                               unsigned OpNo, ArrayRef<MachineOperand> MOs,
                               MachineBasicBlock::iterator InsertPt,
-                              MachineInstr *MI, const TargetInstrInfo &TII) {
+                              MachineInstr *MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
   // Omit the implicit operands, something BuildMI can't do.
   MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
                                               MI->getDebugLoc(), true);
@@ -4926,7 +4950,7 @@
     MachineOperand &MO = MI->getOperand(i);
     if (i == OpNo) {
       assert(MO.isReg() && "Expected to fold into reg operand!");
-      addOperands(MIB, MOs);
+      addOperands(MIB, MOs, PtrOffset);
     } else {
       MIB.addOperand(MO);
     }
@@ -4948,6 +4972,40 @@
   return MIB.addImm(0);
 }
 
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    // Attempt to convert the load of the inserted vector into a folded load
+    // of a single float.
+    if (OpNum == 2) {
+      unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+      unsigned ZMask = Imm & 15;
+      unsigned DstIdx = (Imm >> 4) & 3;
+      unsigned SrcIdx = (Imm >> 6) & 3;
+
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 4 <= Align) {
+        int PtrOffset = SrcIdx * 4;
+        unsigned NewImm = (DstIdx << 4) | ZMask;
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+                                                 : X86::INSERTPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+        return NewMI;
+      }
+    }
+    break;
+  };
+
+  return nullptr;
+}
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -4977,6 +5035,12 @@
     return nullptr;
 
   MachineInstr *NewMI = nullptr;
+
+  // Attempt to fold any custom cases we have.
+  if ((NewMI =
+           foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)))
+    return NewMI;
+
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
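Taken together, foldMemoryOperandCustom and the PtrOffset plumbing through FuseInst/addOperands turn a register insertps whose second source is reloaded from memory into the rm form that loads just the selected float: the displacement grows by 4 * CountS and the new immediate keeps only CountD and ZMask, since the memory form ignores CountS. A small illustrative model of that rewrite, checked against the immediates that appear in the test updates below (helper name is invented for the example; this is not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Illustrative model of the INSERTPSrr -> INSERTPSrm rewrite above: the folded
    // load is narrowed to a single float, so the source element index moves into
    // the address displacement and is dropped from the immediate.
    static void foldInsertPSImm(uint8_t Imm, int &PtrOffset, unsigned &NewImm) {
      unsigned ZMask = Imm & 15;
      unsigned DstIdx = (Imm >> 4) & 3;
      unsigned SrcIdx = (Imm >> 6) & 3;
      PtrOffset = SrcIdx * 4;         // extra bytes added to the displacement
      NewImm = (DstIdx << 4) | ZMask; // CountS is cleared in the memory form
    }

    int main() {
      int Off;
      unsigned Imm;
      foldInsertPSImm(96, Off, Imm);  // avx.ll below: $96 -> vinsertps $32, 4(%...)
      assert(Off == 4 && Imm == 32);
      foldInsertPSImm(192, Off, Imm); // avx.ll below: $192 -> vinsertps $0, 12(%...)
      assert(Off == 12 && Imm == 0);
      return 0;
    }
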
Index: llvm/trunk/test/CodeGen/X86/avx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx.ll
+++ llvm/trunk/test/CodeGen/X86/avx.ll
@@ -32,7 +32,7 @@
 ; On X32, account for the argument's move to registers
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
-; CHECK: insertps $48
+; CHECK: vinsertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -46,7 +46,7 @@
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK: vinsertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -60,7 +60,7 @@
 ; X32: movl 8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
Index: llvm/trunk/test/CodeGen/X86/insertps-unfold-load-bug.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/insertps-unfold-load-bug.ll
+++ llvm/trunk/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
+
+; Test for the case where insertps folded the load of the insertion element,
+; but a later optimization then manipulated the load.
+
+define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
+; X32-LABEL: insertps_unfold:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movaps (%eax), %xmm0
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_unfold:
+; X64:       # BB#0:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+  %a = getelementptr inbounds <4 x float>, <4 x float>* %v1, i64 0, i64 1
+  %b = load float, float* %a, align 4
+  %c = insertelement <4 x float> undef, float %b, i32 0
+  %d = load <4 x float>, <4 x float>* %v1, align 16
+  %e = load <4 x float>, <4 x float>* %v0, align 16
+  %f = shufflevector <4 x float> %e, <4 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %g = fadd <4 x float> %c, %f
+  ret <4 x float> %g
+}
Index: llvm/trunk/test/CodeGen/X86/sse41.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll
+++ llvm/trunk/test/CodeGen/X86/sse41.ll
@@ -794,12 +794,12 @@
 ; X32-LABEL: insertps_from_vector_load:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -812,12 +812,12 @@
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -831,13 +831,13 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## BB#0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -968,12 +968,12 @@
 ; X32-LABEL: pr20087:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr20087:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-NEXT:    retq
   %load = load <4 x float> , <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
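The stack-folding tests added below use the usual pattern from those files: the inline asm clobbers xmm2-xmm15 (plus flags), so one of the 16-byte vector arguments has to be spilled across it, and the CHECK lines verify that the subsequent reload is folded straight into insertps. The chosen immediate, i8 209 (0xD1), decodes to CountS=3, CountD=1, ZMask=1, so the custom fold above adds 12 bytes to the reload address and the printed immediate becomes $17 with the "zero,mem[0],xmm0[2,3]" comment. A quick check of that arithmetic, illustrative only and not part of the patch:

    #include <cassert>

    int main() {
      unsigned Imm = 209;                       // 0xD1, used by the new tests
      unsigned SrcIdx = (Imm >> 6) & 3;         // 3 -> reload address offset +12
      unsigned NewImm = (Imm & 15) | (((Imm >> 4) & 3) << 4); // keep ZMask/CountD
      assert(SrcIdx * 4 == 12 && NewImm == 17); // CHECK: (v)insertps $17, ...
      return 0;
    }
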
Index: llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx1.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -946,7 +946,15 @@
   ret <8 x float> %2
 }
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK:       vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT:  {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_maxpd
Index: llvm/trunk/test/CodeGen/X86/stack-folding-fp-sse42.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ llvm/trunk/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -637,7 +637,15 @@
 }
 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK:       insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT:  {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209)
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_maxpd