Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -7246,6 +7246,10 @@ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + unsigned VectorSizeInBits = V1.getValueType().getSizeInBits(); + unsigned ScalarSizeInBits = VectorSizeInBits / Mask.size(); + assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); + for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. @@ -7254,17 +7258,47 @@ continue; } - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. + // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + M %= Size; + + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) continue; - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.isUndef() || X86::isZeroNode(Input)) - Zeroable[i] = true; + // If the BUILD_VECTOR has fewer elements then the bitcasted portion of + // the (larger) source element must be UNDEF/ZERO. 
+ if ((Size % V.getNumOperands()) == 0) { + unsigned Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef() || X86::isZeroNode(Op)) + Zeroable[i] = true; + else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + APInt Val = Cst->getAPIntValue(); + Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val = Val.getLoBits(ScalarSizeInBits); + Zeroable[i] = (Val == 0); + } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { + APInt Val = Cst->getValueAPF().bitcastToAPInt(); + Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val = Val.getLoBits(ScalarSizeInBits); + Zeroable[i] = (Val == 0); + } + continue; + } + + // If the BUILD_VECTOR has more elements then all the (smaller) source + // elements must be UNDEF or ZERO. + if ((V.getNumOperands() % Size) == 0) { + unsigned Scale = V->getNumOperands() / Size; + bool AllZeroable = true; + for (unsigned j = 0; j != Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); + } + Zeroable[i] = AllZeroable; + continue; + } } return Zeroable; Index: test/CodeGen/X86/insertps-combine.ll =================================================================== --- test/CodeGen/X86/insertps-combine.ll +++ test/CodeGen/X86/insertps-combine.ll @@ -135,22 +135,18 @@ define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind { ; SSE-LABEL: insertps_zero_from_v2f64: ; SSE: # BB#0: -; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00] -; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movapd %xmm2, (%rdi) -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movapd (%rdi), %xmm1 +; SSE-NEXT: addpd {{.*}}(%rip), %xmm1 +; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; SSE-NEXT: movapd %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: insertps_zero_from_v2f64: ; AVX: # BB#0: -; AVX-NEXT: 
vmovapd {{.*#+}} xmm1 = [1.000000e+00,2.000000e+00] -; AVX-NEXT: vaddpd (%rdi), %xmm1, %xmm2 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3] -; AVX-NEXT: vmovapd %xmm2, (%rdi) +; AVX-NEXT: vmovapd (%rdi), %xmm1 +; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; AVX-NEXT: vmovapd %xmm1, (%rdi) ; AVX-NEXT: retq %1 = load <2 x double>, <2 x double>* %a1 %2 = bitcast <2 x double> <double 1.0, double 2.0> to <4 x float> @@ -163,27 +159,23 @@ define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind { ; SSE-LABEL: insertps_zero_from_v2i64: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,18446744073709551614] -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: paddq %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] -; SSE-NEXT: movdqa %xmm2, (%rdi) -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: paddq {{.*}}(%rip), %xmm1 +; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: insertps_zero_from_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,18446744073709551614] -; AVX-NEXT: vpaddq (%rdi), %xmm1, %xmm2 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3] -; AVX-NEXT: vmovdqa %xmm2, (%rdi) +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; AVX-NEXT: vmovdqa %xmm1, (%rdi) ; AVX-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %a1 %2 = bitcast <2 x i64> <i64 1, i64 -2> to <4 x float> %3 = add <2 x i64> %1, <i64 1, i64 -2> - %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 6, i32 2, i32 2, i32 3> + %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 5, i32 2, i32 2, i32 3> store <2 x i64> %3, <2 x i64> *%a1 ret <4 x float> %4 } @@ -191,21 +183,18 @@ define <4 x float> 
@insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind { ; SSE-LABEL: insertps_zero_from_v8i16: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,1,2,2,3,3] -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: paddw %xmm1, %xmm2 -; SSE-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: paddw {{.*}}(%rip), %xmm1 +; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: insertps_zero_from_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,1,1,2,2,3,3] -; AVX-NEXT: vpaddw (%rdi), %xmm1, %xmm2 -; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm2, (%rdi) +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3] +; AVX-NEXT: vmovdqa %xmm1, (%rdi) ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a1 %2 = bitcast <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3> to <4 x float> Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -210,26 +210,26 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind { ; CHECK-LABEL: rot: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <158,158,158,u> -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; CHECK-NEXT: pshufb %xmm1, %xmm0 -; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u> +; CHECK-NEXT: pshufb %xmm0, %xmm1 +; CHECK-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 
+; CHECK-NEXT: movd %xmm1, %eax ; CHECK-NEXT: movw %ax, (%rsi) ; CHECK-NEXT: movb $-98, 2(%rsi) -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u> -; CHECK-NEXT: pshufb %xmm1, %xmm0 -; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u> +; CHECK-NEXT: pshufb %xmm0, %xmm1 +; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: movw %ax, (%rdx) ; CHECK-NEXT: movb $1, 2(%rdx) ; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrld $1, %xmm2 -; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; CHECK-NEXT: pextrb $8, %xmm2, 2(%rdi) -; CHECK-NEXT: pshufb %xmm1, %xmm2 -; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrld $1, %xmm1 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi) +; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; CHECK-NEXT: movd %xmm0, %eax ; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: movq %rdi, %rax