Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6007,7 +6007,10 @@
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                     SelectionDAG &DAG) {
-  if (!Subtarget->hasFp256())
+  // VBROADCAST requires AVX.
+  // TODO: It's possible to optimize splats for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget->hasAVX())
     return SDValue();

   MVT VT = Op.getSimpleValueType();
@@ -6084,25 +6087,57 @@
     }
   }

-  bool IsGE256 = (VT.getSizeInBits() >= 256);
+  unsigned VecSize = VT.getSizeInBits();
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+  bool IsGE256 = (VecSize >= 256);
+
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool OptForSize = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

-  // Handle the broadcasting a single constant scalar from the constant pool
-  // into a vector. On Sandybridge it is still better to load a constant vector
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
   // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasInt256()) {
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    unsigned ScalarSize = CVT.getSizeInBits();
-
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
-      const Constant *C = nullptr;
+    unsigned Opcode = X86ISD::VBROADCAST; // This only changes for v2[f|i]64.
+    const Constant *C = nullptr;
+
+    // The v2[f/i]64 case is a mess because there is no VBROADCAST to handle it.
+    // Deal with it before handling all other vector types.
+    if (VecSize == 128 && ScalarSize == 64) {
+      // This is only a size optimization - could be slightly slower in time.
+      if (OptForSize) {
+        if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) {
+          C = CI->getConstantIntValue();
+          if (!Subtarget->hasAVX2()) {
+            // For an AVX CPU, fake an int splat with FP splat.
+            Opcode = X86ISD::MOVDDUP;
+            CVT = MVT::v2f64;
+            VT = MVT::v2f64;
+          }
+        } else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) {
+          C = CF->getConstantFPValue();
+          Opcode = X86ISD::MOVDDUP;
+        }
+      }
+    } else if (ScalarSize >= 32 || (OptForSize && Subtarget->hasAVX2())) {
+      // Splat f32, f64, i32, i64 (excluding v2f64 and v2i64 handled above).
+      // For size optimization with AVX2, also splat i8 and i16.
+      // For an AVX CPU, fake i32 splats using the FP splat instruction.
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
         C = CF->getConstantFPValue();
+    }

-      assert(C && "Invalid constant type");
-
+    if (C) {
       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
@@ -6110,12 +6145,11 @@
                        MachinePointerInfo::getConstantPool(),
                        false, false, false, Alignment);

-      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+      return DAG.getNode(Opcode, dl, VT, Ld);
     }
   }

   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-  unsigned ScalarSize = Ld.getValueType().getSizeInBits();

   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
Index: test/CodeGen/X86/splat-for-size.ll
===================================================================
--- test/CodeGen/X86/splat-for-size.ll
+++ test/CodeGen/X86/splat-for-size.ll
@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+  %add = add <2 x i64> %x, <i64 1, i64 1>
+  ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; AVX: vmovddup
+; AVX2: vpbroadcastq
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+  %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq
+; AVX: vpaddq
+; AVX2: vpbroadcastq
+; AVX2: vpaddq
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd
+; CHECK-NEXT: vpaddd
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+  %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd
+; AVX: vpaddd
+; AVX2: vpbroadcastd
+; AVX2: vpaddd
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+  %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw
+; CHECK: vpaddw
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+  %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw
+; AVX: vpaddw
+; AVX2: vpbroadcastw
+; AVX2: vpaddw
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+  %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb
+; CHECK: vpaddb
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+  %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb
+; AVX: vpaddb
+; AVX2: vpbroadcastb
+; AVX2: vpaddb
+; CHECK: retq
+}
+
+attributes #0 = { optsize }
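
For reference, here is a minimal companion sketch (not part of the patch) showing the other side of the size heuristic: without the optsize attribute, an AVX-only (no AVX2) target is still expected to load the whole splat vector from the constant pool, typically folded into the arithmetic instruction, rather than splatting the scalar with vmovddup. The function name, RUN line, and constant value below are placeholders for illustration, not taken from the original test.

; Assumed RUN line: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s
; With no optsize attribute, the <2 x double> splat constant should still be
; materialized as a full vector load from the constant pool on AVX, e.g.
; folded into the add as something like: vaddpd {{.*}}(%rip), %xmm0, %xmm0
define <2 x double> @splat_v2f64_no_optsize(<2 x double> %x) {
  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
  ret <2 x double> %add
}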