Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -323,6 +323,10 @@ : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast.">; +def FeaturePreferAVX256 + : SubtargetFeature<"prefer-avx256", "PreferAVX256", "true", + "Prefer 256-bit AVX instructions">; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -348,6 +348,9 @@ /// unsigned MaxInlineSizeThreshold; + /// Prefer 256-bit AVX instructions over 512-bit instructions. + bool PreferAVX256; + /// What processor and OS we're targeting. Triple TargetTriple; @@ -561,6 +564,8 @@ bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } + bool preferAVX256() const { return PreferAVX256; } + bool isXRaySupported() const override { return is64Bit(); } X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -365,6 +365,7 @@ X86ProcFamily = Others; GatherOverhead = 1024; ScatterOverhead = 1024; + PreferAVX256 = false; } X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -255,6 +255,18 @@ if (SoftFloat) Key += FS.empty() ? "+soft-float" : ",+soft-float"; + // Translate vector width function attribute into subtarget features. 
This + // overrides any CPU-specific tuning parameter. + if (F.hasFnAttribute("prefer-vector-width")) { + StringRef Val = F.getFnAttribute("prefer-vector-width").getValueAsString(); + unsigned Width; + if (!Val.getAsInteger(0, Width)) { + if (Key.size() > CPU.size()) + Key += ","; + Key += (Width < 512) ? "+prefer-avx256" : "-prefer-avx256"; + } + } + FS = Key.substr(CPU.size()); auto &I = SubtargetMap[Key]; Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -131,7 +131,7 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { - if (ST->hasAVX512()) + if (ST->hasAVX512() && !ST->preferAVX256()) return 512; if (ST->hasAVX()) return 256; Index: test/Transforms/LoopVectorize/X86/avx512.ll =================================================================== --- test/Transforms/LoopVectorize/X86/avx512.ll +++ test/Transforms/LoopVectorize/X86/avx512.ll @@ -1,4 +1,5 @@ ; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s +; RUN: opt -mattr=+avx512vl,+prefer-avx256 --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-PREFER-AVX256 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -10,6 +11,12 @@ ; CHECK: vmovdqu32 %zmm{{.}}, ; CHECK-NOT: %ymm +; Verify that we don't generate 512-bit wide vectors when the prefer-avx256 feature says not to. + +; CHECK-PREFER-AVX256-LABEL: f: +; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}}, +; CHECK-PREFER-AVX256-NOT: %zmm + define void @f(i32* %a, i32 %n) { entry: %cmp4 = icmp sgt i32 %n, 0 @@ -33,3 +40,38 @@ for.end: ; preds = %for.end.loopexit, %entry ret void } + +; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit +; vectors. + +; CHECK-LABEL: g: +; CHECK: vmovdqu %ymm{{.}}, +; CHECK-NOT: %zmm + +; 
CHECK-PREFER-AVX256-LABEL: g: +; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}}, +; CHECK-PREFER-AVX256-NOT: %zmm + +define void @g(i32* %a, i32 %n) "prefer-vector-width"="256" { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %n, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +}