Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5996,7 +5996,10 @@
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                     SelectionDAG &DAG) {
-  if (!Subtarget->hasFp256())
+  // VBROADCAST requires AVX.
+  // TODO: Splats could be generated for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget->hasAVX())
     return SDValue();
 
   MVT VT = Op.getSimpleValueType();
@@ -6073,17 +6076,34 @@
     }
   }
 
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   bool IsGE256 = (VT.getSizeInBits() >= 256);
 
-  // Handle the broadcasting a single constant scalar from the constant pool
-  // into a vector. On Sandybridge it is still better to load a constant vector
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  // TODO: If multiple splats are generated to load the same constant,
+  // it may be detrimental to overall size. There needs to be a way to detect
+  // that condition to know if this is truly a size win.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool OptForSize = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
   // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasInt256()) {
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    unsigned ScalarSize = CVT.getSizeInBits();
 
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+    // For size optimization, also splat v2f64 and v2i64, and for size opt
+    // with AVX2, also splat i8 and i16.
+    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
@@ -6104,7 +6124,6 @@
   }
 
   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
 
   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -5290,6 +5290,13 @@
             (VMOVDDUPYrr VR256:$src)>;
 }
 
+let Predicates = [UseAVX, OptForSize] in {
+  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>;
+}
+
 let Predicates = [UseSSE3] in {
   def : Pat<(X86Movddup (memopv2f64 addr:$src)),
             (MOVDDUPrm addr:$src)>;
Index: llvm/trunk/test/CodeGen/X86/splat-for-size.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/splat-for-size.ll
+++ llvm/trunk/test/CodeGen/X86/splat-for-size.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+  %add = fadd <4 x double> %x, <double 2.0, double 2.0, double 2.0, double 2.0>
+  ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+  %add = fadd <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0>
+  ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+  %add = fadd <8 x float> %x, <float 4.0, float 4.0, float 4.0, float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+  %add = add <2 x i64> %x, <i64 1, i64 1>
+  ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; CHECK: vmovddup
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+  %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
+  ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq
+; AVX: vpaddq
+; AVX2: vpbroadcastq
+; AVX2: vpaddq
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+  %add = add <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd
+; CHECK-NEXT: vpaddd
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+  %add = add <8 x i32> %x, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd
+; AVX: vpaddd
+; AVX2: vpbroadcastd
+; AVX2: vpaddd
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+  %add = add <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw
+; CHECK: vpaddw
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+  %add = add <16 x i16> %x, <i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6>
+  ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw
+; AVX: vpaddw
+; AVX2: vpbroadcastw
+; AVX2: vpaddw
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+  %add = add <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb
+; CHECK: vpaddb
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+  %add = add <32 x i8> %x, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb
+; AVX: vpaddb
+; AVX2: vpbroadcastb
+; AVX2: vpaddb
+; CHECK: retq
+}
+
+attributes #0 = { optsize }
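
Note: a minimal companion sketch (not part of the patch) for contrast. Without the 'optsize' attribute, the new OptForSize path in LowerVectorBroadcast is not taken, so on an AVX target a 128-bit f64 splat constant is still expected to come from a full 16-byte constant-pool vector (typically folded into the vaddpd) rather than being generated with vmovddup. The function name and constant value below are illustrative only; the command mirrors the AVX RUN line in the test above.

; llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s
; Same computation as splat_v2f64 above, but with no 'optsize' attribute on the
; function, so the splat constant is expected to be loaded whole from the
; constant pool instead of being splatted with vmovddup.
define <2 x double> @splat_v2f64_no_optsize(<2 x double> %x) {
  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
  ret <2 x double> %add
}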