Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -93,6 +93,11 @@
              "stores respectively."),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableLightAVX("x86-light-avx", cl::init(false),
+                   cl::desc("Enable generation of 256-bit AVX loads/stores, "
+                            "even with -mprefer-vector-width=128"));
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -2657,7 +2662,7 @@
   }
   // FIXME: Check if unaligned 32-byte accesses are slow.
   if (Op.size() >= 32 && Subtarget.hasAVX() &&
-      (Subtarget.getPreferVectorWidth() >= 256)) {
+      (Subtarget.getPreferVectorWidth() >= 256 || EnableLightAVX)) {
     // Although this isn't a well-supported type for AVX1, we'll let
     // legalization and shuffle lowering produce the optimal codegen. If we
     // choose an optimal type with a vector element larger than a byte,
Index: llvm/test/CodeGen/X86/memcpy-light-avx.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/memcpy-light-avx.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=prefer-128-bit -x86-light-avx=true | FileCheck %s
+; Check that a 32-byte memcpy is lowered to 256-bit AVX loads/stores when
+; -x86-light-avx is enabled, even though prefer-128-bit caps the vector width.
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+define void @test1(ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0:
+; CHECK-NEXT:    vmovups (%rsi), %ymm0
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 32, i1 false)
+  ret void
+}