diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5039,13 +5039,30 @@
 // Other Lowering Hooks
 //===----------------------------------------------------------------------===//
 
-static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
-  return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
+static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                        bool AssumeSingleUse = false) {
+  if (!AssumeSingleUse && !Op.hasOneUse())
+    return false;
+  if (!ISD::isNormalLoad(Op.getNode()))
+    return false;
+
+  // If this is an unaligned vector, make sure the target supports folding it.
+  auto *Ld = cast<LoadSDNode>(Op.getNode());
+  if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
+      Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
+    return false;
+
+  // TODO: If this is a non-temporal load and the target has an instruction
+  //       for it, it should not be folded. See "useNonTemporalLoad()".
+
+  return true;
 }
 
-static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+static bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+                                            const X86Subtarget &Subtarget,
                                             bool AssumeSingleUse = false) {
-  if (!MayFoldLoad(Op, AssumeSingleUse))
+  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
+  if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
     return false;
 
   // We can not replace a wide volatile load with a broadcast-from-memory,
@@ -8996,8 +9013,9 @@
       Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
     } else {
       if (!Subtarget.hasAVX2() &&
-          !MayFoldLoadIntoBroadcastFromMem(
+          !mayFoldLoadIntoBroadcastFromMem(
               RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
+              Subtarget,
               /*AssumeSingleUse=*/true))
         return SDValue();
       Broadcast =
@@ -12727,8 +12745,8 @@
                                         &DAG](SDValue &Input,
                                               MutableArrayRef<int> InputMask) {
     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
-    if (!Subtarget.hasAVX2() &&
-        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+    if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
+                                 !mayFoldLoad(Input, Subtarget)))
       return;
     if (isNoopShuffleMask(InputMask))
       return;
@@ -16413,7 +16431,7 @@
   bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
-      MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+      mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
     if (!Ld->isNonTemporal()) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19413,7 +19431,8 @@
   // FIXME: relax the profitability check iff all N1 uses are insertions.
   if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
       ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
-       (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
+       (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
+        mayFoldLoad(N1, Subtarget)))) {
     SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
     SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -19486,7 +19505,7 @@
     // combine either bitwise AND or insert of float 0.0 to set these bits.
 
     bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
-    if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+    if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
       // If this is an insertion of 32-bits into the low 32-bits of
       // a vector, we prefer to generate a blend with immediate rather
       // than an insertps. Blends are simpler operations in hardware and so
@@ -24626,8 +24645,8 @@
   // being inserted between two CMOV's. (in i16 case too TBN)
   // https://bugs.llvm.org/show_bug.cgi?id=40974
   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
-      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
-       !MayFoldLoad(Op2))) {
+      (Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
+       !mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36974,7 +36993,7 @@
     if (isUndefOrEqual(Mask, 0)) {
       if (V1.getValueType() == MaskVT &&
          V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          MayFoldLoad(V1.getOperand(0))) {
+          mayFoldLoad(V1.getOperand(0), Subtarget)) {
         if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
           return SDValue(); // Nothing to do!
         Res = V1.getOperand(0);
@@ -38415,8 +38434,10 @@
     SDValue N0 = V.getOperand(0);
     SDValue N1 = V.getOperand(1);
     unsigned Imm = V.getConstantOperandVal(2);
-    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
-        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+    const X86Subtarget &Subtarget =
+        static_cast<const X86Subtarget &>(DAG.getSubtarget());
+    if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+        mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
       return SDValue();
     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51652,8 +51673,9 @@
 
     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
-        (Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
-                                    Op0.getOperand(0), VT.getScalarType())))
+        (Subtarget.hasAVX2() ||
+         mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0), VT.getScalarType(),
+                                         Subtarget)))
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
                                      Op0.getOperand(0),
@@ -51662,7 +51684,7 @@
     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Subtarget.hasAVX2() ||
-         (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+         (EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
         Op0.getOperand(0).getValueType() == VT.getScalarType())
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 
@@ -52994,7 +53016,7 @@
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+    if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
      return false;
    break;
  }
@@ -53009,11 +53031,11 @@
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     // Avoid disabling potential load folding opportunities.
-    if (MayFoldLoad(N1) &&
+    if (mayFoldLoad(N1, Subtarget) &&
        (!Commute || !isa<ConstantSDNode>(N0) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
      return false;
-    if (MayFoldLoad(N0) &&
+    if (mayFoldLoad(N0, Subtarget) &&
        ((Commute && !isa<ConstantSDNode>(N1)) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
      return false;
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1398,40 +1398,40 @@
 define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
 ; SSE2-LABEL: interleave_24i32_out:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqu 64(%rdi), %xmm9
 ; SSE2-NEXT:    movups 80(%rdi), %xmm8
-; SSE2-NEXT:    movups 64(%rdi), %xmm3
-; SSE2-NEXT:    movdqu (%rdi), %xmm1
-; SSE2-NEXT:    movups 16(%rdi), %xmm5
-; SSE2-NEXT:    movups 32(%rdi), %xmm10
-; SSE2-NEXT:    movdqu 48(%rdi), %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    movaps %xmm10, %xmm7
-; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    movaps %xmm8, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
-; SSE2-NEXT:    movups %xmm5, 16(%rsi)
-; SSE2-NEXT:    movups %xmm11, (%rsi)
-; SSE2-NEXT:    movups %xmm2, 16(%rdx)
-; SSE2-NEXT:    movups %xmm1, (%rdx)
-; SSE2-NEXT:    movups %xmm6, 16(%rcx)
-; SSE2-NEXT:    movups %xmm0, (%rcx)
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu 16(%rdi), %xmm10
+; SSE2-NEXT:    movups 32(%rdi), %xmm5
+; SSE2-NEXT:    movdqu 48(%rdi), %xmm3
+; SSE2-NEXT:    movaps %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0]
+; SSE2-NEXT:    movaps %xmm8, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm9[2,3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0]
+; SSE2-NEXT:    movups %xmm2, 16(%rsi)
+; SSE2-NEXT:    movups %xmm4, (%rsi)
+; SSE2-NEXT:    movups %xmm3, 16(%rdx)
+; SSE2-NEXT:    movups %xmm0, (%rdx)
+; SSE2-NEXT:    movups %xmm1, 16(%rcx)
+; SSE2-NEXT:    movups %xmm7, (%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: interleave_24i32_out:
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -97,20 +97,18 @@
 ; X32-LABEL: t4_under_aligned:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movups (%eax), %xmm1
-; X32-NEXT:    xorps %xmm2, %xmm2
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; X32-NEXT:    movups (%eax), %xmm0
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; X32-NEXT:    retl
 ;
 ; ALIGN-LABEL: t4_under_aligned:
 ; ALIGN:       # %bb.0:
-; ALIGN-NEXT:    movups (%rdi), %xmm1
-; ALIGN-NEXT:    xorps %xmm2, %xmm2
-; ALIGN-NEXT:    xorps %xmm0, %xmm0
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; ALIGN-NEXT:    movups (%rdi), %xmm0
+; ALIGN-NEXT:    xorps %xmm1, %xmm1
+; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; ALIGN-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; ALIGN-NEXT:    retq
 ;
 ; UNALIGN-LABEL: t4_under_aligned:
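For context, here is a minimal, hypothetical IR sketch (not taken from this patch; the function and value names are invented) of the pattern the new alignment check in mayFoldLoad() guards: a 128-bit vector load with alignment below 16 feeding a shuffle. With the added check, a target without AVX and without SSEUnalignedMem no longer reports such a load as foldable, so it stays a separate unaligned load (movups) rather than becoming the memory operand of an SSE instruction that requires 16-byte alignment. The t4_under_aligned checks updated in vec_insert-5.ll above exercise this situation.

; Hypothetical reduced example (names invented), in the spirit of t4_under_aligned:
define <4 x float> @under_aligned_shuffle(<4 x float>* %p) nounwind {
  ; align 4 is below the 16-byte alignment that SSE memory operands require,
  ; so mayFoldLoad() now reports this load as not foldable on pre-AVX targets
  ; without fast unaligned memory access.
  %v = load <4 x float>, <4 x float>* %p, align 4
  %s = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 4, i32 3>
  ret <4 x float> %s
}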