Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18780,6 +18780,43 @@
   unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
 
+  // Push subvector bitcasts to the output, adjusting the index as we go.
+  // insert_subvector(bitcast(v), bitcast(s), c1)
+  //  -> bitcast(insert_subvector(v, s, c2))
+  if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
+      N1.getOpcode() == ISD::BITCAST) {
+    SDValue N0Src = peekThroughBitcasts(N0);
+    SDValue N1Src = peekThroughBitcasts(N1);
+    EVT N0SrcSVT = N0Src.getValueType().getScalarType();
+    EVT N1SrcSVT = N1Src.getValueType().getScalarType();
+    if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
+        N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
+      EVT NewVT;
+      SDLoc DL(N);
+      SDValue NewIdx;
+      MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      LLVMContext &Ctx = *DAG.getContext();
+      unsigned NumElts = VT.getVectorNumElements();
+      unsigned EltSizeInBits = VT.getScalarSizeInBits();
+      if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
+        unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
+        NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
+        NewIdx = DAG.getConstant(InsIdx * Scale, DL, IdxVT);
+      } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
+        unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
+        if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
+          NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
+          NewIdx = DAG.getConstant(InsIdx / Scale, DL, IdxVT);
+        }
+      }
+      if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
+        SDValue Res = DAG.getBitcast(NewVT, N0Src);
+        Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
+        return DAG.getBitcast(VT, Res);
+      }
+    }
+  }
+
   // Canonicalize insert_subvector dag nodes.
   // Example:
   // (insert_subvector (insert_subvector A, Idx0), Idx1)
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -196,7 +196,7 @@
   // Integer absolute.
   if (Subtarget.hasCMov()) {
     setOperationAction(ISD::ABS , MVT::i16 , Custom);
-    setOperationAction(ISD::ABS , MVT::i32 , Custom); 
+    setOperationAction(ISD::ABS , MVT::i32 , Custom);
   }
   setOperationAction(ISD::ABS , MVT::i64 , Custom);
 
@@ -25828,7 +25828,7 @@
 
   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
   // lowering available in lowerAtomicArith.
-  // TODO: push more cases through this path. 
+  // TODO: push more cases through this path.
   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
        AI->use_empty())
@@ -25886,7 +25886,7 @@
 /// Emit a locked operation on a stack location which does not change any
 /// memory location, but does involve a lock prefix. Location is chosen to be
 /// a) very likely accessed only by a single thread to minimize cache traffic,
-/// and b) definitely dereferenceable. Returns the new Chain result. 
+/// and b) definitely dereferenceable. Returns the new Chain result.
 static SDValue emitLockedStackOp(SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget,
                                  SDValue Chain, SDLoc DL) {
@@ -25895,22 +25895,22 @@
   //    operations issued by the current processor. As such, the location
   //    referenced is not relevant for the ordering properties of the instruction.
   //    See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
-  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions 
+  //    8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
   // 2) Using an immediate operand appears to be the best encoding choice
   //    here since it doesn't require an extra register.
   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
   //    is small enough it might just be measurement noise.)
   // 4) When choosing offsets, there are several contributing factors:
   //   a) If there's no redzone, we default to TOS. (We could allocate a cache
-  //      line aligned stack object to improve this case.) 
+  //      line aligned stack object to improve this case.)
   //   b) To minimize our chances of introducing a false dependence, we prefer
-  //      to offset the stack usage from TOS slightly. 
+  //      to offset the stack usage from TOS slightly.
   //   c) To minimize concerns about cross thread stack usage - in particular,
   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
   //      captures state in the TOS frame and accesses it from many threads -
   //      we want to use an offset such that the offset is in a distinct cache
   //      line from the TOS frame.
-  // 
+  //
   // For a general discussion of the tradeoffs and benchmark results, see:
   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
 
@@ -25963,7 +25963,7 @@
   if (Subtarget.hasMFence())
     return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
-  SDValue Chain = Op.getOperand(0); 
+  SDValue Chain = Op.getOperand(0);
   return emitLockedStackOp(DAG, Subtarget, Chain, dl);
 }
 
@@ -26452,12 +26452,12 @@
   // seq_cst which isn't SingleThread, everything just needs to be preserved
   // during codegen and then dropped. Note that we expect (but don't assume),
   // that orderings other than seq_cst and acq_rel have been canonicalized to
-  // a store or load. 
+  // a store or load.
   if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
       AN->getSyncScopeID() == SyncScope::System) {
     // Prefer a locked operation against a stack location to minimize cache
     // traffic. This assumes that stack locations are very likely to be
-    // accessed only by the owning thread. 
+    // accessed only by the owning thread.
     SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
     assert(!N->hasAnyUseOfValue(0));
     // NOTE: The getUNDEF is needed to give something for the unused result 0.
@@ -35133,7 +35133,7 @@
   }
 
   // TODO: This switch could include FNEG and the x86-specific FP logic ops
-  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid 
+  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
   // missed load folding and fma+fneg combining.
   switch (Vec.getOpcode()) {
   case ISD::FMA: // Begin 3 operands
@@ -43066,42 +43066,6 @@
     }
   }
 
-  // Push subvector bitcasts to the output, adjusting the index as we go.
-  // insert_subvector(bitcast(v), bitcast(s), c1) ->
-  //  bitcast(insert_subvector(v,s,c2))
-  // TODO: Move this to generic - which only supports same scalar sizes.
-  if ((Vec.isUndef() || Vec.getOpcode() == ISD::BITCAST) &&
-      SubVec.getOpcode() == ISD::BITCAST) {
-    SDValue VecSrc = peekThroughBitcasts(Vec);
-    SDValue SubVecSrc = peekThroughBitcasts(SubVec);
-    MVT VecSrcSVT = VecSrc.getSimpleValueType().getScalarType();
-    MVT SubVecSrcSVT = SubVecSrc.getSimpleValueType().getScalarType();
-    if (Vec.isUndef() || VecSrcSVT == SubVecSrcSVT) {
-      MVT NewOpVT;
-      SDValue NewIdx;
-      unsigned NumElts = OpVT.getVectorNumElements();
-      unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
-      if ((EltSizeInBits % SubVecSrcSVT.getSizeInBits()) == 0) {
-        unsigned Scale = EltSizeInBits / SubVecSrcSVT.getSizeInBits();
-        NewOpVT = MVT::getVectorVT(SubVecSrcSVT, NumElts * Scale);
-        NewIdx = DAG.getIntPtrConstant(IdxVal * Scale, dl);
-      } else if ((SubVecSrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
-        unsigned Scale = SubVecSrcSVT.getSizeInBits() / EltSizeInBits;
-        if ((IdxVal % Scale) == 0) {
-          NewOpVT = MVT::getVectorVT(SubVecSrcSVT, NumElts / Scale);
-          NewIdx = DAG.getIntPtrConstant(IdxVal / Scale, dl);
-        }
-      }
-      if (NewIdx && DAG.getTargetLoweringInfo().isOperationLegal(
-                        ISD::INSERT_SUBVECTOR, NewOpVT)) {
-        SDValue Res = DAG.getBitcast(NewOpVT, VecSrc);
-        Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewOpVT, Res, SubVecSrc,
-                          NewIdx);
-        return DAG.getBitcast(OpVT, Res);
-      }
-    }
-  }
-
   // Match concat_vector style patterns.
   SmallVector<SDValue, 2> SubVectorOps;
   if (collectConcatOps(N, SubVectorOps))
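
For reference, here is a standalone C++ sketch (not part of the patch) of the index-rescaling arithmetic that both the new generic combine and the removed X86 code perform when pushing bitcasts through insert_subvector. The helper name rescaleInsertSubvector and the RescaledInsert struct are illustrative assumptions, not LLVM APIs; it mirrors the Scale/NewIdx checks in the DAGCombiner hunk above.

// Standalone sketch, not part of the patch: mirrors the Scale/NewIdx logic
// from the DAGCombiner hunk. Requires C++17 for std::optional.
#include <cstdio>
#include <optional>

struct RescaledInsert {
  unsigned NumElts; // element count of the rescaled insert_subvector type
  unsigned InsIdx;  // adjusted insertion index
};

// NumElts/EltBits describe the insert_subvector result type VT; SrcEltBits is
// the scalar width of the bitcast source vectors. Returns std::nullopt when
// the fold does not apply (same divisibility checks as the patch).
static std::optional<RescaledInsert>
rescaleInsertSubvector(unsigned NumElts, unsigned EltBits,
                       unsigned SrcEltBits, unsigned InsIdx) {
  if (EltBits % SrcEltBits == 0) {
    // Narrower source scalars: scale the element count and the index up.
    unsigned Scale = EltBits / SrcEltBits;
    return RescaledInsert{NumElts * Scale, InsIdx * Scale};
  }
  if (SrcEltBits % EltBits == 0) {
    // Wider source scalars: element count and index must divide evenly.
    unsigned Scale = SrcEltBits / EltBits;
    if (NumElts % Scale == 0 && InsIdx % Scale == 0)
      return RescaledInsert{NumElts / Scale, InsIdx / Scale};
  }
  return std::nullopt;
}

int main() {
  // Inserting a bitcast v2i64 subvector into a v16i32 vector at index 4
  // becomes an insert of v2i64 into v8i64 at index 2, then a bitcast back.
  if (auto R = rescaleInsertSubvector(16, 32, 64, 4))
    std::printf("new insert: %u elements, index %u\n", R->NumElts, R->InsIdx);
  return 0;
}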