Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8334,29 +8334,40 @@
 /// filtering. While a little annoying to re-dispatch on type here, there isn't
 /// a convenient way to factor it out.
 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
-static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
-                                             ArrayRef<int> Mask,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V1,
+                                             SDValue V2, ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
-  if (!Subtarget.hasAVX())
-    return SDValue();
-  if (VT.isInteger() && !Subtarget.hasAVX2())
+  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
+        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
+        (Subtarget.hasAVX2() && VT.isInteger())))
     return SDValue();
 
+  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
+  // we can only broadcast from a register with AVX2.
+  unsigned NumElts = Mask.size();
+  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+
   // Check that the mask is a broadcast.
   int BroadcastIdx = -1;
-  for (int M : Mask)
-    if (M >= 0 && BroadcastIdx == -1)
-      BroadcastIdx = M;
-    else if (M >= 0 && M != BroadcastIdx)
-      return SDValue();
+  for (int i = 0; i != (int)NumElts; ++i) {
+    SmallVector<int, 8> BroadcastMask(NumElts, i);
+    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
+      BroadcastIdx = i;
+      break;
+    }
+  }
+  if (BroadcastIdx < 0)
+    return SDValue();
 
   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                             "a sorted mask where the broadcast "
                                             "comes from V1.");
 
   // Go up the chain of (vector) values to find a scalar load that we can
   // combine with the broadcast.
+  SDValue V = V1;
   for (;;) {
     switch (V.getOpcode()) {
     case ISD::CONCAT_VECTORS: {
@@ -8409,9 +8420,8 @@
        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
 
-    // If the scalar isn't a load, we can't broadcast from it in AVX1.
-    // Only AVX2 has register broadcasts.
-    if (!Subtarget.hasAVX2() && !isShuffleFoldableLoad(V))
+    // If we can't broadcast from a register, check that the input is a load.
+    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
       return SDValue();
   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
@@ -8428,8 +8438,8 @@
       V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                       DAG.getMachineFunction().getMachineMemOperand(
                           Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-  } else if (!Subtarget.hasAVX2()) {
-    // We can't broadcast from a vector register without AVX2.
+  } else if (!BroadcastFromReg) {
+    // We can't broadcast from a vector register.
     return SDValue();
   } else if (BroadcastIdx != 0) {
     // We can only broadcast from the zero-element of a vector register,
@@ -8452,8 +8462,10 @@
                     DAG.getIntPtrConstant(BroadcastIdx, DL));
   }
 
-  V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V);
-  return DAG.getBitcast(VT, V);
+  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
+
+  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
 }
 
 // Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -8669,11 +8681,10 @@
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
-    // Use low duplicate instructions for masks that match their pattern.
-    if (Subtarget.hasSSE3())
-      if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
-        return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
-
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
     // Straight shuffle of a single input vector. Simulate this by using the
     // single input as both of the "inputs" to this instruction..
     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
@@ -8751,7 +8762,7 @@
 
   if (isSingleInputShuffleMask(Mask)) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -8973,7 +8984,7 @@
 
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -9065,7 +9076,7 @@
 
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -9705,7 +9716,7 @@
 
   if (NumV2Inputs == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -9904,7 +9915,7 @@
   // For single-input shuffles, there are some nicer lowering tricks we can use.
   if (NumV2Elements == 0) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
                                                           Mask, Subtarget, DAG))
      return Broadcast;
 
@@ -10957,7 +10968,7 @@
 
   if (isSingleInputShuffleMask(Mask)) {
     // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
                                                           Mask, Subtarget, DAG))
       return Broadcast;
 
@@ -11055,7 +11066,7 @@
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -11128,7 +11139,7 @@
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -11234,7 +11245,7 @@
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -11316,7 +11327,7 @@
     return ZExt;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -11413,7 +11424,7 @@
     return ZExt;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
                                                         Mask, Subtarget, DAG))
     return Broadcast;
 
@@ -11723,7 +11734,7 @@
 
   // Check for being able to broadcast a single element.
   if (SDValue Broadcast =
-          lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
+          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Dispatch to each element type for lowering. If we don't have supprot for
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -135,8 +135,7 @@
 define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
 ; CHECK-LABEL: splat_load_2f64_11:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovaps (%rdi), %xmm0
-; CHECK-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; CHECK-NEXT:    retq
   %x = load <2 x double>, <2 x double>* %ptr
   %x1 = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 1>
Index: test/CodeGen/X86/avx-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx-vbroadcast.ll
+++ test/CodeGen/X86/avx-vbroadcast.ll
@@ -315,14 +315,12 @@
 ; X32-LABEL: load_splat_2f64_2f64_1111:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %xmm0
-; X32-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_2f64_2f64_1111:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    retq
 entry:
   %ld = load <2 x double>, <2 x double>* %ptr
Index: test/CodeGen/X86/avx2-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx2-vbroadcast.ll
+++ test/CodeGen/X86/avx2-vbroadcast.ll
@@ -494,14 +494,12 @@
 ; X32-LABEL: load_splat_2f64_2f64_1111:
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovaps (%eax), %xmm0
-; X32-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: load_splat_2f64_2f64_1111:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    vmovaps (%rdi), %xmm0
-; X64-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    retq
 entry:
   %ld = load <2 x double>, <2 x double>* %ptr
Index: test/CodeGen/X86/extractelement-load.ll
===================================================================
--- test/CodeGen/X86/extractelement-load.ll
+++ test/CodeGen/X86/extractelement-load.ll
@@ -63,13 +63,13 @@
 ;
 ; X64-SSSE3-LABEL: t3:
 ; X64-SSSE3:       # BB#0: # %bb
-; X64-SSSE3-NEXT:    movupd (%rax), %xmm0
-; X64-SSSE3-NEXT:    movhpd %xmm0, (%rax)
+; X64-SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT:    movlpd %xmm0, (%rax)
 ;
 ; X64-AVX-LABEL: t3:
 ; X64-AVX:       # BB#0: # %bb
-; X64-AVX-NEXT:    vmovupd (%rax), %xmm0
-; X64-AVX-NEXT:    vmovhpd %xmm0, (%rax)
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vmovlpd %xmm0, (%rax)
 bb:
   %tmp13 = load <2 x double>, <2 x double>* undef, align 1
   %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
Index: test/CodeGen/X86/vec_extract-sse4.ll
===================================================================
--- test/CodeGen/X86/vec_extract-sse4.ll
+++ test/CodeGen/X86/vec_extract-sse4.ll
@@ -20,8 +20,7 @@
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movapd (%eax), %xmm0
-; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
 ; CHECK-NEXT:    movss %xmm0, (%esp)
 ; CHECK-NEXT:    flds (%esp)
 ; CHECK-NEXT:    popl %eax
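
For reference, the heart of the patch is the new broadcast-index scan: instead of taking the first defined mask element, it asks for every candidate index i whether the whole mask is shuffle-equivalent to a splat of element i, so undef lanes (and, via isShuffleEquivalent, two-input masks) can still qualify for VBROADCAST/MOVDDUP. Below is a minimal standalone C++ sketch of the single-input core of that test; findBroadcastIndex is an illustrative name, not an LLVM API, and the two-input canonicalisation performed by isShuffleEquivalent is deliberately omitted.

#include <cstdio>
#include <vector>

// Simplified stand-in for the patched scan: try every element index i and
// accept it if the mask is a splat of element i. A mask entry of -1 means
// "undef" and matches any index.
static int findBroadcastIndex(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  for (int i = 0; i != NumElts; ++i) {
    bool IsSplatOfI = true;
    for (int M : Mask)
      if (M >= 0 && M != i) { // every defined lane must read element i
        IsSplatOfI = false;
        break;
      }
    if (IsSplatOfI)
      return i; // mask is equivalent to the splat mask {i, i, ..., i}
  }
  return -1; // no single source element: not a broadcast
}

int main() {
  // {1, -1} splats element 1; the undef lane matches any index.
  printf("%d\n", findBroadcastIndex({1, -1})); // prints 1
  // {0, 1} is the identity shuffle, so no broadcast index exists.
  printf("%d\n", findBroadcastIndex({0, 1}));  // prints -1
  return 0;
}

The scan is quadratic in the element count, which is harmless at the mask sizes involved here (at most 32 lanes), and in exchange it accepts every mask that is provably equivalent to some splat rather than only those whose first defined element happens to be the broadcast source.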