diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -615,6 +615,10 @@
     : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
                        "Prefer 256-bit AVX instructions">;
 
+def TuningAllowLight256Bit
+    : SubtargetFeature<"allow-light-256-bit", "AllowLight256Bit", "true",
+                       "Enable generation of 256-bit load/stores even if we prefer 128-bit">;
+
 def TuningPreferMaskRegisters
     : SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
                        "Prefer AVX512 mask registers over PTEST/MOVMSK">;
@@ -777,7 +781,8 @@
                                         TuningFastVariablePerLaneShuffle,
                                         TuningPOPCNTFalseDeps,
                                         TuningLZCNTFalseDeps,
-                                        TuningInsertVZEROUPPER];
+                                        TuningInsertVZEROUPPER,
+                                        TuningAllowLight256Bit];
   list<SubtargetFeature> HSWFeatures =
     !listconcat(IVBFeatures, HSWAdditionalFeatures);
 
@@ -805,7 +810,8 @@
                                         TuningFastVariableCrossLaneShuffle,
                                         TuningFastVariablePerLaneShuffle,
                                         TuningPOPCNTFalseDeps,
-                                        TuningInsertVZEROUPPER];
+                                        TuningInsertVZEROUPPER,
+                                        TuningAllowLight256Bit];
   list<SubtargetFeature> SKLFeatures =
     !listconcat(BDWFeatures, SKLAdditionalFeatures);
 
@@ -833,7 +839,8 @@
                                         TuningFastVariablePerLaneShuffle,
                                         TuningPrefer256Bit,
                                         TuningPOPCNTFalseDeps,
-                                        TuningInsertVZEROUPPER];
+                                        TuningInsertVZEROUPPER,
+                                        TuningAllowLight256Bit];
   list<SubtargetFeature> SKXFeatures =
     !listconcat(BDWFeatures, SKXAdditionalFeatures);
 
@@ -870,7 +877,8 @@
                                         TuningFastVariableCrossLaneShuffle,
                                         TuningFastVariablePerLaneShuffle,
                                         TuningPrefer256Bit,
-                                        TuningInsertVZEROUPPER];
+                                        TuningInsertVZEROUPPER,
+                                        TuningAllowLight256Bit];
   list<SubtargetFeature> CNLFeatures =
     !listconcat(SKLFeatures, CNLAdditionalFeatures);
 
@@ -894,7 +902,8 @@
                                         TuningFastVariableCrossLaneShuffle,
                                         TuningFastVariablePerLaneShuffle,
                                         TuningPrefer256Bit,
-                                        TuningInsertVZEROUPPER];
+                                        TuningInsertVZEROUPPER,
+                                        TuningAllowLight256Bit];
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
 
@@ -1276,7 +1285,8 @@
                                        TuningFastMOVBE,
                                        TuningSlowSHLD,
                                        TuningSBBDepBreaking,
-                                       TuningInsertVZEROUPPER];
+                                       TuningInsertVZEROUPPER,
+                                       TuningAllowLight256Bit];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                   FeatureRDPID,
                                                   FeatureRDPRU,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2702,7 +2702,7 @@
   }
   // FIXME: Check if unaligned 32-byte accesses are slow.
   if (Op.size() >= 32 && Subtarget.hasAVX() &&
-      (Subtarget.getPreferVectorWidth() >= 256)) {
+      Subtarget.useLight256BitInstructions()) {
     // Although this isn't a well-supported type for AVX1, we'll let
     // legalization and shuffle lowering produce the optimal codegen. If we
     // choose an optimal type with a vector element larger than a byte,
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -255,6 +255,10 @@
     return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
   }
 
+  bool useLight256BitInstructions() const {
+    return getPreferVectorWidth() >= 256 || AllowLight256Bit;
+  }
+
   bool useBWIRegs() const {
     return hasBWI() && useAVX512Regs();
   }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -92,6 +92,7 @@
       // Perf-tuning flags.
      X86::TuningFastGather,
      X86::TuningSlowUAMem32,
+      X86::TuningAllowLight256Bit,

      // Based on whether user set the -mprefer-vector-width command line.
      X86::TuningPrefer128Bit,
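For reviewers, the net effect of the new subtarget hook can be summarized in a small standalone C++ sketch (illustrative only: the struct and its fields are simplified stand-ins for the real X86Subtarget plumbing, not code from the patch):

#include <cassert>

// Minimal sketch of the new gating logic. In the patch, PreferVectorWidth
// comes from the "prefer-vector-width" attribute / prefer-128-bit feature,
// and AllowLight256Bit from the new allow-light-256-bit tuning flag.
struct SubtargetSketch {
  unsigned PreferVectorWidth;
  bool AllowLight256Bit;

  // Mirrors X86Subtarget::useLight256BitInstructions(): "light" 256-bit
  // operations (the plain loads/stores emitted for memcpy/memset) are
  // allowed either when 256-bit vectors are preferred anyway, or when the
  // CPU opts in despite preferring 128-bit vectors.
  bool useLight256BitInstructions() const {
    return PreferVectorWidth >= 256 || AllowLight256Bit;
  }
};

int main() {
  assert(SubtargetSketch{256, false}.useLight256BitInstructions());
  assert(SubtargetSketch{128, true}.useLight256BitInstructions());
  assert(!SubtargetSketch{128, false}.useLight256BitInstructions());
  return 0;
}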
diff --git a/llvm/test/CodeGen/X86/memcpy-light-avx.ll b/llvm/test/CodeGen/X86/memcpy-light-avx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcpy-light-avx.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=prefer-128-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=znver1 -mattr=prefer-128-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2,+prefer-128-bit,+allow-light-256-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2,+prefer-128-bit,-allow-light-256-bit | FileCheck %s --check-prefixes=NO256
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+define void @test1(ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups (%rsi), %ymm0
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+;
+; NO256-LABEL: test1:
+; NO256:       # %bb.0:
+; NO256-NEXT:    vmovups (%rsi), %xmm0
+; NO256-NEXT:    vmovups 16(%rsi), %xmm1
+; NO256-NEXT:    vmovups %xmm1, 16(%rdi)
+; NO256-NEXT:    vmovups %xmm0, (%rdi)
+; NO256-NEXT:    retq
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 32, i1 0)
+  ret void
+}
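The guarded code sits in X86TargetLowering::getOptimalMemOpType, so the widening applies to memset as well as memcpy. A quick sanity check along the same lines (hypothetical IR, not one of the patch's tests) could be:

; Hypothetical memset check, not part of the patch: with the tuning flag
; enabled, a 32-byte memset of zero should lower to one YMM store instead
; of two XMM stores, e.g. under
;   llc -mtriple=x86_64-- -mcpu=haswell -mattr=+prefer-128-bit
define void @zero32(ptr %p) nounwind {
  tail call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 32, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1)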
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
--- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -1,18 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,PREFER256
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=sandybridge | FileCheck %s --check-prefixes=CHECK,LIGHT256
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,PREFER256
 
 ; This tests whether or not we generate vectors larger than the preferred vector width when
 ; lowering memmove.
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local void @A(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: A:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovups (%rdi), %xmm0
-; CHECK-NEXT:    vmovups 16(%rdi), %xmm1
-; CHECK-NEXT:    vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT:    vmovups %xmm0, (%rsi)
-; CHECK-NEXT:    retq
+; PREFER256-LABEL: A:
+; PREFER256:       # %bb.0: # %entry
+; PREFER256-NEXT:    vmovups (%rdi), %ymm0
+; PREFER256-NEXT:    vmovups %ymm0, (%rsi)
+; PREFER256-NEXT:    vzeroupper
+; PREFER256-NEXT:    retq
+;
+; LIGHT256-LABEL: A:
+; LIGHT256:       # %bb.0: # %entry
+; LIGHT256-NEXT:    vmovups (%rdi), %xmm0
+; LIGHT256-NEXT:    vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT:    vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT:    retq
 entry:
   call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 32, i1 false)
   ret void
@@ -20,17 +29,26 @@
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local void @B(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: B:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovups (%rdi), %xmm0
-; CHECK-NEXT:    vmovups 16(%rdi), %xmm1
-; CHECK-NEXT:    vmovups 32(%rdi), %xmm2
-; CHECK-NEXT:    vmovups 48(%rdi), %xmm3
-; CHECK-NEXT:    vmovups %xmm3, 48(%rsi)
-; CHECK-NEXT:    vmovups %xmm2, 32(%rsi)
-; CHECK-NEXT:    vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT:    vmovups %xmm0, (%rsi)
-; CHECK-NEXT:    retq
+; PREFER256-LABEL: B:
+; PREFER256:       # %bb.0: # %entry
+; PREFER256-NEXT:    vmovups (%rdi), %ymm0
+; PREFER256-NEXT:    vmovups 32(%rdi), %ymm1
+; PREFER256-NEXT:    vmovups %ymm1, 32(%rsi)
+; PREFER256-NEXT:    vmovups %ymm0, (%rsi)
+; PREFER256-NEXT:    vzeroupper
+; PREFER256-NEXT:    retq
+;
+; LIGHT256-LABEL: B:
+; LIGHT256:       # %bb.0: # %entry
+; LIGHT256-NEXT:    vmovups (%rdi), %xmm0
+; LIGHT256-NEXT:    vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT:    vmovups 32(%rdi), %xmm2
+; LIGHT256-NEXT:    vmovups 48(%rdi), %xmm3
+; LIGHT256-NEXT:    vmovups %xmm3, 48(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm2, 32(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT:    retq
 entry:
   call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 64, i1 false)
   ret void
@@ -67,7 +85,7 @@
 
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
 
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
 attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }