Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26185,52 +26185,6 @@
   return SDValue();
 }
 
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
-                                         SelectionDAG &DAG) {
-  SDLoc dl(Load);
-  MVT VT = Load->getSimpleValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  SDValue Addr = Load->getOperand(1);
-  SDValue NewAddr = DAG.getNode(
-      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
-      DAG.getConstant(Index * EVT.getStoreSize(), dl,
-                      Addr.getSimpleValueType()));
-
-  SDValue NewLoad =
-      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
-                  DAG.getMachineFunction().getMachineMemOperand(
-                      Load->getMemOperand(), 0, EVT.getStoreSize()));
-  return NewLoad;
-}
-
-static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget *Subtarget) {
-  SDLoc dl(N);
-  MVT VT = N->getOperand(1)->getSimpleValueType(0);
-  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
-         "X86insertps is only defined for v4x32");
-
-  SDValue Ld = N->getOperand(1);
-  if (MayFoldLoad(Ld)) {
-    // Extract the countS bits from the immediate so we can get the proper
-    // address when narrowing the vector load to a specific element.
-    // When the second source op is a memory address, insertps doesn't use
-    // countS and just gets an f32 from that address.
-    unsigned DestIndex =
-        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-
-    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
-
-    // Create this as a scalar to vector to match the instruction pattern.
-    SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
-    // countS bits are ignored when loading from memory on insertps, which
-    // means we don't need to explicitly set them to 0.
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
-                       LoadScalarToVector, N->getOperand(2));
-  }
-  return SDValue();
-}
-
 static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   SDValue V0 = N->getOperand(0);
   SDValue V1 = N->getOperand(1);
@@ -26694,14 +26648,9 @@
   case X86ISD::MOVSD:
   case X86ISD::VPERMILPI:
   case X86ISD::VPERM2X128:
-  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
-  case X86ISD::INSERTPS: {
-    if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
-      return PerformINSERTPSCombine(N, DAG, Subtarget);
-    break;
-  }
-  case X86ISD::BLENDI:    return PerformBLENDICombine(N, DAG);
+  case X86ISD::BLENDI:      return PerformBLENDICombine(N, DAG);
   }
 
   return SDValue();
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -443,7 +443,7 @@
                         const MachineInstr *DefMI, unsigned DefIdx,
                         const MachineInstr *UseMI, unsigned UseIdx) const override;
- 
+
   bool useMachineCombiner() const override {
     return true;
   }
@@ -512,6 +512,14 @@
                                              MachineBasicBlock::iterator &MBBI,
                                              LiveVariables *LV) const;
+  /// Handles memory folding for special-case instructions, e.g. those
+  /// requiring custom manipulation of the address.
+  MachineInstr *foldMemoryOperandSpecial(MachineFunction &MF, MachineInstr *MI,
+                                         unsigned OpNum,
+                                         ArrayRef<MachineOperand> MOs,
+                                         MachineBasicBlock::iterator InsertPt,
+                                         unsigned Size, unsigned Align) const;
+
   /// isFrameOperand - Return true and the FrameIndex if the specified
   /// operand and follow operands form a reference to the stack frame.
   bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -4824,12 +4824,35 @@
   return false;
 }
 
-static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) {
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+                        int PtrOffset = 0) {
   unsigned NumAddrOps = MOs.size();
-  for (unsigned i = 0; i != NumAddrOps; ++i)
-    MIB.addOperand(MOs[i]);
-  if (NumAddrOps < 4) // FrameIndex only
-    addOffset(MIB, 0);
+
+  if (NumAddrOps < 4) {
+    // FrameIndex only - add an immediate offset (whether it's zero or not).
+    for (unsigned i = 0; i != NumAddrOps; ++i)
+      MIB.addOperand(MOs[i]);
+    addOffset(MIB, PtrOffset);
+  } else {
+    // General Memory Addressing - we need to add any offset to an existing
+    // offset.
+    assert(MOs.size() == 5 && "Unexpected memory operand list length");
+    for (unsigned i = 0; i != NumAddrOps; ++i) {
+      const MachineOperand &MO = MOs[i];
+      if (i == 3 && PtrOffset != 0) {
+        assert((MO.isImm() || MO.isGlobal()) &&
+               "Unexpected memory operand type");
+        if (MO.isImm()) {
+          MIB.addImm(MO.getImm() + PtrOffset);
+        } else {
+          MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset() + PtrOffset,
+                               MO.getTargetFlags());
+        }
+      } else {
+        MIB.addOperand(MO);
+      }
+    }
+  }
 }
 
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
@@ -4864,17 +4887,18 @@
 static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
                               unsigned OpNo, ArrayRef<MachineOperand> MOs,
                               MachineBasicBlock::iterator InsertPt,
-                              MachineInstr *MI, const TargetInstrInfo &TII) {
+                              MachineInstr *MI, const TargetInstrInfo &TII,
+                              int PtrOffset = 0) {
   // Omit the implicit operands, something BuildMI can't do.
-  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
-                                              MI->getDebugLoc(), true);
+  MachineInstr *NewMI =
+      MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true);
   MachineInstrBuilder MIB(MF, NewMI);
 
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(i);
     if (i == OpNo) {
       assert(MO.isReg() && "Expected to fold into reg operand!");
-      addOperands(MIB, MOs);
+      addOperands(MIB, MOs, PtrOffset);
     } else {
       MIB.addOperand(MO);
     }
@@ -4896,6 +4920,40 @@
   return MIB.addImm(0);
 }
 
+MachineInstr *X86InstrInfo::foldMemoryOperandSpecial(
+    MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+    unsigned Size, unsigned Align) const {
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    // Attempt to convert the load of the inserted vector into a folded load
+    // of a single float.
+    if (OpNum == 2) {
+      unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+      unsigned ZMask = Imm & 15;
+      unsigned DstIdx = (Imm >> 4) & 3;
+      unsigned SrcIdx = (Imm >> 6) & 3;
+
+      unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+      if (Size <= RCSize && 4 <= Align) {
+        int PtrOffset = SrcIdx * 4;
+        unsigned NewImm = (DstIdx << 4) | ZMask;
+        unsigned NewOpCode =
+            (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+                                                 : X86::INSERTPSrm);
+        MachineInstr *NewMI =
+            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+        return NewMI;
+      }
+    }
+    break;
+  }
+
+  return nullptr;
+}
+
 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
     ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
@@ -4925,6 +4983,12 @@
     return nullptr;
 
   MachineInstr *NewMI = nullptr;
+
+  // Attempt to fold any special cases we have.
+  if ((NewMI =
+           foldMemoryOperandSpecial(MF, MI, OpNum, MOs, InsertPt, Size, Align)))
+    return NewMI;
+
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places. It requires
   // replacing the *two* registers with the memory location.
Index: test/CodeGen/X86/avx.ll
===================================================================
--- test/CodeGen/X86/avx.ll
+++ test/CodeGen/X86/avx.ll
@@ -32,7 +32,7 @@
 ; On X32, account for the argument's move to registers
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
-; CHECK: insertps $48
+; CHECK: vinsertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -46,7 +46,7 @@
 ; X32: movl 4(%esp), %eax
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK: vinsertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; CHECK-NEXT: ret
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -60,7 +60,7 @@
 ; X32: movl 8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -794,12 +794,12 @@
 ; X32-LABEL: insertps_from_vector_load:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -812,12 +812,12 @@
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X32-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[1],xmm0[3]
+; X64-NEXT:    insertps $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -831,13 +831,13 @@
 ; X32-LABEL: insertps_from_vector_load_offset_2:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X32-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## BB#0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[3],xmm0[1,2,3]
+; X64-NEXT:    insertps $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -968,12 +968,12 @@
 ; X32-LABEL: pr20087:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr20087:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
 ; X64-NEXT:    retq
   %load = load <4 x float> , <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32>
Index: test/CodeGen/X86/stack-folding-fp-avx1.ll
===================================================================
--- test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -946,7 +946,15 @@
   ret <8 x float> %2
 }
 
-; TODO stack_fold_insertps
+define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_insertps
+  ;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3]
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_maxpd Index: test/CodeGen/X86/stack-folding-fp-sse42.ll =================================================================== --- test/CodeGen/X86/stack-folding-fp-sse42.ll +++ test/CodeGen/X86/stack-folding-fp-sse42.ll @@ -637,7 +637,15 @@ } declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone -; TODO stack_fold_insertps +define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) { + ;CHECK-LABEL: stack_fold_insertps + ;CHECK: insertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + ;CHECK-NEXT: {{.*#+}} xmm0 = zero,mem[0],xmm0[2,3] + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() + %2 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 209) + ret <4 x float> %2 +} +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) { ;CHECK-LABEL: stack_fold_maxpd