diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2058,18 +2058,19 @@
 /// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
     bool ZeroMemset, bool MemcpyStrSrc,
     const AttributeList &FuncAttributes) const {
   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
-    if (Size >= 16 &&
-        (!Subtarget.isUnalignedMem16Slow() ||
-         ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16)))) {
+    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+                       ((DstAlign == 0 || DstAlign >= 16) &&
+                        (SrcAlign == 0 || SrcAlign >= 16)))) {
       // FIXME: Check if unaligned 32-byte accesses are slow.
-      if (Size >= 32 && Subtarget.hasAVX()) {
+      if (Size >= 32 && Subtarget.hasAVX() &&
+          (Subtarget.getPreferVectorWidth() >= 256)) {
         // Although this isn't a well-supported type for AVX1, we'll let
         // legalization and shuffle lowering produce the optimal codegen. If we
         // choose an optimal type with a vector element larger than a byte,
@@ -2077,11 +2078,12 @@
         // multiply) before we splat as a vector.
         return MVT::v32i8;
       }
-      if (Subtarget.hasSSE2())
+      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v16i8;
       // TODO: Can SSE1 handle a byte vector?
       // If we have SSE1 registers we should be able to use them.
-      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+          (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -4963,6 +4965,10 @@
     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
     return (MemVT.getSizeInBits() <= MaxIntSize);
   }
+  // Make sure we don't merge greater than our preferred vector
+  // width.
+  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+    return false;
   return true;
 }
 
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+; This tests whether or not we generate vectors larger than the preferred vector width when
+; lowering memmove.
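+;
+; @A and @B use attribute set #0, which carries "prefer-vector-width"="128": their
+; 32- and 64-byte memmoves should be expanded with 128-bit (xmm) moves and must not
+; use 256-bit (ymm) or 512-bit (zmm) moves. @C and @D use attribute set #2 with
+; "prefer-vector-width"="256": 256-bit (ymm) moves are acceptable there, but
+; 512-bit (zmm) moves are not.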
+ +; Function Attrs: nounwind uwtable +define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 { +entry: +; CHECK: A +; CHECK-NOT: vmovups %ymm +; CHECK: vmovups %xmm + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) + ret void +} + +; Function Attrs: nounwind uwtable +define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 { +entry: +; CHECK: B +; CHECK-NOT: vmovups %zmm +; CHECK: vmovups %xmm + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) + ret void +} + +; Function Attrs: nounwind uwtable +define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 { +entry: +; CHECK: C +; CHECK-NOT: vmovups %ymm +; CHECK: vmovups %ymm + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) + ret void +} + +; Function Attrs: nounwind uwtable +define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 { +entry: +; CHECK: D +; CHECK-NOT: vmovups %zmm +; CHECK: vmovups %ymm + call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = !{i32 1, !"wchar_size", i32 4}