Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18174,6 +18174,7 @@
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   SDValue Ops[] = {LHS, RHS};
+  EVT VT = N->getValueType(0);
 
   // See if we can constant fold the vector operation.
   if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -18191,7 +18192,6 @@
     ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
 
     if (SVN0->getMask().equals(SVN1->getMask())) {
-      EVT VT = N->getValueType(0);
       SDValue UndefVector = LHS.getOperand(1);
       SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                      LHS.getOperand(0), RHS.getOperand(0),
@@ -18202,6 +18202,29 @@
     }
   }
 
+  // The following pattern is likely to emerge with vector reduction ops. Moving
+  // the binary operation ahead of insertion may allow using a narrower vector
+  // instruction that has better performance than the wide version of the op:
+  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
+  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+      LHS.getOperand(2) == RHS.getOperand(2) &&
+      (LHS.hasOneUse() || RHS.hasOneUse())) {
+    SDValue X = LHS.getOperand(1);
+    SDValue Y = RHS.getOperand(1);
+    SDValue Z = LHS.getOperand(2);
+    EVT NarrowVT = X.getValueType();
+    if (NarrowVT == Y.getValueType() &&
+        TLI.isOperationLegalOrCustomOrPromote(N->getOpcode(), NarrowVT)) {
+      // (binop undef, undef) may not return undef, so compute that result.
+      SDLoc DL(N);
+      SDValue VecC = DAG.getNode(N->getOpcode(), DL, VT, DAG.getUNDEF(VT),
+                                 DAG.getUNDEF(VT));
+      SDValue NarrowBO = DAG.getNode(N->getOpcode(), DL, NarrowVT, X, Y);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
+    }
+  }
+
   return SDValue();
 }
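To make the comment above concrete, the following is a small hedged sketch of the kind of input the combine targets (a hypothetical function, not one of the tests touched by this patch): both operands are narrow vectors widened with undef upper lanes, so each becomes an insert_subvector into undef in the DAG, and on an AVX target the fadd would then be expected to execute as a 128-bit operation before the result is widened. This mirrors the xor_insert_insert test updated further below, where vxorps now operates on xmm instead of ymm registers.

  define <8 x float> @narrow_fadd_sketch(<4 x float> %x, <4 x float> %y) {
    ; Widen both 4-element operands to 8 elements with undef upper lanes.
    %wx = shufflevector <4 x float> %x, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
    %wy = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
    ; Only the low half of this wide fadd carries defined data; the combine
    ; should let it be done on the narrow (128-bit) type.
    %r = fadd <8 x float> %wx, %wy
    ret <8 x float> %r
  }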
Index: llvm/trunk/test/CodeGen/X86/avx512-hadd-hsub.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -111,14 +111,14 @@
 ; KNL: # %bb.0:
 ; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: hadd_16_3:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT: retq
   %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32>
@@ -134,14 +134,14 @@
 ; KNL: # %bb.0:
 ; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vaddps %ymm0, %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: fhadd_16_3:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vaddps %ymm0, %ymm2, %ymm0
 ; SKX-NEXT: retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32>
@@ -156,14 +156,14 @@
 ; KNL: # %bb.0:
 ; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: fhadd_16_4:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT: retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
Index: llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
+++ llvm/trunk/test/CodeGen/X86/scalarize-fp.ll
@@ -198,9 +198,8 @@
 ;
 ; AVX-LABEL: fadd_op1_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fadd <4 x double> %v,
@@ -219,7 +218,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -237,9 +236,8 @@
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fsub <4 x double> , %v
@@ -258,7 +256,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -275,9 +273,8 @@
 ;
 ; AVX-LABEL: fmul_op1_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fmul <4 x double> %v,
@@ -296,7 +293,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -313,9 +310,8 @@
 ;
 ; AVX-LABEL: fdiv_op1_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> %v,
@@ -334,7 +330,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vdivpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -352,9 +348,8 @@
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> , %v
@@ -373,7 +368,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
Index: llvm/trunk/test/CodeGen/X86/vector-partial-undef.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-partial-undef.ll
+++ llvm/trunk/test/CodeGen/X86/vector-partial-undef.ll
@@ -13,9 +13,7 @@
 ;
 ; AVX-LABEL: xor_insert_insert:
 ; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32>
   %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32>
@@ -32,9 +30,9 @@
 ;
 ; AVX-LABEL: xor_insert_insert_high_half:
 ; AVX: # %bb.0:
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT: retq
   %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32>
   %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32>
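The xor_insert_insert_high_half diff above also shows why the combine computes VecC as (binop undef, undef) rather than reusing undef for the wide result: for xor, that wide binop folds to zero, so the half of the result that is not overwritten by the insert must actually be zeroed, which is where the extra vxorps %xmm1, %xmm1, %xmm1 comes from. Below is a hedged sketch of that high-half shape (hypothetical function name and mask values, not copied from the test file):

  define <4 x i64> @xor_high_half_sketch(<2 x i64> %x, <2 x i64> %y) {
    ; Both operands are widened into the upper half of a <4 x i64> vector.
    %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    ; With the combine, the xor is expected to be done on 128 bits and inserted
    ; into a zeroed wide vector, since xor(undef, undef) folds to 0, not undef.
    %r = xor <4 x i64> %xw, %yw
    ret <4 x i64> %r
  }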