diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23503,14 +23503,58 @@
 }
 
 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
-  SDValue Scalar = N->getOperand(0);
   EVT VT = N->getValueType(0);
   if (!VT.isFixedLengthVector())
     return SDValue();
 
+  // Try to convert a scalar binop with an extracted vector element to a vector
+  // binop. This is intended to reduce potentially expensive register moves.
+  // TODO: Check if both operands are extracted.
+  // TODO: Generalize this, so it can be called from visitINSERT_VECTOR_ELT().
+  SDValue Scalar = N->getOperand(0);
+  unsigned Opcode = Scalar.getOpcode();
+  if (Scalar.hasOneUse() && Scalar->getNumValues() == 1 &&
+      TLI.isBinOp(Opcode) && VT.getScalarType() == Scalar.getValueType() &&
+      DAG.isSafeToSpeculativelyExecute(Opcode) && hasOperation(Opcode, VT)) {
+    // Match an extract element and get a shuffle mask equivalent.
+    SmallVector<int, 8> ShufMask(VT.getVectorNumElements(), -1);
+    auto getShuffleMaskForExtElt = [&](SDValue EE) {
+      if (EE.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          EE.getOperand(0).getValueType() == VT &&
+          isa<ConstantSDNode>(EE.getOperand(1))) {
+        // Mask = {ExtractIndex, undef, undef....}
+        ShufMask[0] = EE.getConstantOperandVal(1);
+        // Make sure the shuffle is legal if we are crossing lanes.
+        return TLI.isShuffleMaskLegal(ShufMask, VT);
+      }
+      return false;
+    };
+
+    // s2v (bo (extelt V, Idx), C) --> shuffle (bo V, C'), {Idx, -1, -1...}
+    if (auto *C = dyn_cast<ConstantSDNode>(Scalar.getOperand(1))) {
+      if (getShuffleMaskForExtElt(Scalar.getOperand(0))) {
+        SDLoc DL(N);
+        SDValue V = Scalar.getOperand(0).getOperand(0);
+        SDValue VecC = DAG.getConstant(C->getAPIntValue(), DL, VT);
+        SDValue VecBO = DAG.getNode(Opcode, DL, VT, V, VecC);
+        return DAG.getVectorShuffle(VT, DL, VecBO, DAG.getUNDEF(VT), ShufMask);
+      }
+    }
+    // s2v (bo C, (extelt V, Idx)) --> shuffle (bo C', V), {Idx, -1, -1...}
+    if (auto *C = dyn_cast<ConstantSDNode>(Scalar.getOperand(0))) {
+      if (getShuffleMaskForExtElt(Scalar.getOperand(1))) {
+        SDLoc DL(N);
+        SDValue V = Scalar.getOperand(1).getOperand(0);
+        SDValue VecC = DAG.getConstant(C->getAPIntValue(), DL, VT);
+        SDValue VecBO = DAG.getNode(Opcode, DL, VT, VecC, V);
+        return DAG.getVectorShuffle(VT, DL, VecBO, DAG.getUNDEF(VT), ShufMask);
+      }
+    }
+  }
+
   // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
   // with a VECTOR_SHUFFLE and possible truncate.
-  if (Scalar.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+  if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
      !Scalar.getOperand(0).getValueType().isFixedLengthVector())
    return SDValue();
 
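Illustration (not part of the patch): the new combine fires when a scalar binop with one constant operand and one extracted vector element feeds a SCALAR_TO_VECTOR node. A hypothetical function in the style of the vec_shift5.ll tests updated below (the name and the constant 5 are made up for this note) would be:

define <4 x i32> @extelt0_add_pslli_v4i32_example(<4 x i32> %x, <4 x i32> %y) {
  %ext = extractelement <4 x i32> %y, i64 0
  %bo = add i32 %ext, 5
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)

Because the variable shift amount has to be rebuilt as a vector, the scalar add on the extracted lane used to be lowered through a GPR (movd/addl/movd); with this change the add should stay in the vector domain and a shuffle keeps only lane 0, mirroring the updated CHECK lines in the test diffs below.
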
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -89,9 +89,7 @@
 ; F16C-LABEL: test3:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT:    vmovd %xmm0, %eax
-; F16C-NEXT:    movzwl %ax, %eax
-; F16C-NEXT:    vmovd %eax, %xmm0
+; F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -860,9 +860,7 @@
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovd %xmm0, %eax
-; BWON-F16C-NEXT:    movzwl %ax, %eax
-; BWON-F16C-NEXT:    vmovd %eax, %xmm0
+; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; BWON-F16C-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -139,42 +139,57 @@
 define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; SSE-LABEL: dont_fold_urem_power_of_two:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [63,63,63,63]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pextrw $1, %xmm0, %eax
+; SSE-NEXT:    andl $31, %eax
+; SSE-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    andl $7, %eax
+; SSE-NEXT:    pinsrw $2, %eax, %xmm1
 ; SSE-NEXT:    pextrw $3, %xmm0, %eax
 ; SSE-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
 ; SSE-NEXT:    shrl $22, %ecx
 ; SSE-NEXT:    imull $95, %ecx, %ecx
 ; SSE-NEXT:    subl %ecx, %eax
-; SSE-NEXT:    pextrw $1, %xmm0, %ecx
-; SSE-NEXT:    andl $31, %ecx
-; SSE-NEXT:    movd %xmm0, %edx
-; SSE-NEXT:    andl $63, %edx
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrw $1, %ecx, %xmm1
-; SSE-NEXT:    pextrw $2, %xmm0, %ecx
-; SSE-NEXT:    andl $7, %ecx
-; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
 ; SSE-NEXT:    pinsrw $3, %eax, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: dont_fold_urem_power_of_two:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX-NEXT:    shrl $22, %ecx
-; AVX-NEXT:    imull $95, %ecx, %ecx
-; AVX-NEXT:    subl %ecx, %eax
-; AVX-NEXT:    vpextrw $1, %xmm0, %ecx
-; AVX-NEXT:    andl $31, %ecx
-; AVX-NEXT:    vmovd %xmm0, %edx
-; AVX-NEXT:    andl $63, %edx
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
-; AVX-NEXT:    andl $7, %ecx
-; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm0
-; AVX-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: dont_fold_urem_power_of_two:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX1-NEXT:    andl $31, %eax
+; AVX1-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX1-NEXT:    andl $7, %eax
+; AVX1-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX1-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
+; AVX1-NEXT:    shrl $22, %ecx
+; AVX1-NEXT:    imull $95, %ecx, %ecx
+; AVX1-NEXT:    subl %ecx, %eax
+; AVX1-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: dont_fold_urem_power_of_two:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX2-NEXT:    andl $31, %eax
+; AVX2-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX2-NEXT:    andl $7, %eax
+; AVX2-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX2-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
+; AVX2-NEXT:    shrl $22, %ecx
+; AVX2-NEXT:    imull $95, %ecx, %ecx
+; AVX2-NEXT:    subl %ecx, %eax
+; AVX2-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX2-NEXT:    retq
   %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
   ret <4 x i16> %1
 }
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -219,10 +219,10 @@
 define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
 ; CHECK-LABEL: extelt0_sub_pslli_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    movl $32, %ecx
-; CHECK-NEXT:    subl %eax, %ecx
-; CHECK-NEXT:    movd %ecx, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
+; CHECK-NEXT:    psubd %xmm1, %xmm2
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; CHECK-NEXT:    pslld %xmm1, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <4 x i32> %y, i64 0
@@ -232,14 +232,23 @@
 }
 
 define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
-; CHECK-LABEL: extelt1_add_psrli_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    addl $3, %eax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    psrld %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: extelt1_add_psrli_v4i32:
+; X86:       # %bb.0:
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    xorps %xmm2, %xmm2
+; X86-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X86-NEXT:    psrld %xmm2, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: extelt1_add_psrli_v4i32:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; X64-NEXT:    psrld %xmm2, %xmm0
+; X64-NEXT:    retq
   %ext = extractelement <4 x i32> %y, i64 1
   %bo = add i32 %ext, 3
   %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)