diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18409,6 +18409,65 @@
   return DAG.getBitcast(VT, Shuf);
 }
 
+static SDValue pushInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle(
+    SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT);
+
+  auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+  if (!InnerShuf || !InnerShuf->hasOneUse())
+    return SDValue();
+
+  SDValue InVal = N->getOperand(1);
+
+  // We must know which element is being inserted.
+  auto EltNo = dyn_cast<ConstantSDNode>(N->getOperand(2));
+  if (!EltNo)
+    return SDValue();
+
+  unsigned Elt = EltNo->getZExtValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SDValue InVec = InnerShuf->getOperand(0);
+  SDValue InnerBuildVec = InnerShuf->getOperand(1);
+
+  SmallVector<int> InnerMask(InnerShuf->getMask().begin(),
+                             InnerShuf->getMask().end());
+
+  if (InnerBuildVec.getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(InVec, InnerBuildVec);
+    ShuffleVectorSDNode::commuteMask(InnerMask);
+  }
+
+  if (InnerBuildVec.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SmallVector<SDValue> NewOps(NumElts, DAG.getUNDEF(VT.getScalarType()));
+  SmallVector<int> NewMask(NumElts, -1);
+  for (unsigned I = 0; I != NumElts; ++I) {
+    if (Elt == I) {
+      NewOps[I] = InVal;
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    if (InnerMask[I] < 0)
+      continue;
+
+    if ((unsigned)InnerMask[I] >= NumElts) {
+      NewOps[I] = InnerBuildVec.getOperand(InnerMask[I] - NumElts);
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    NewMask[I] = InnerMask[I];
+  }
+  return DAG.getTargetLoweringInfo().buildLegalVectorShuffle(
+      VT, DL, InVec, DAG.getBuildVector(VT, DL, NewOps), NewMask, DAG);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
@@ -18477,6 +18536,29 @@
   if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
+  if (Level == BeforeLegalizeTypes &&
+      InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() &&
+      isa<ConstantSDNode>(InVec.getOperand(2))) {
+    SmallVector<SDValue> Ops(NumElts, DAG.getUNDEF(VT.getScalarType()));
+    unsigned Elt0 = cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+    if (Elt0 < Ops.size())
+      Ops[Elt0] = InVec.getOperand(1);
+    if (Elt < Ops.size())
+      Ops[Elt] = InVal;
+    SmallVector<int> BlendMask;
+    for (unsigned I = 0; I != NumElts; ++I)
+      BlendMask.push_back(I);
+    BlendMask[Elt0] = Elt0 + BlendMask.size();
+    BlendMask[Elt] = Elt + BlendMask.size();
+    return DAG.getTargetLoweringInfo().buildLegalVectorShuffle(
+        VT, DL, InVec.getOperand(0), DAG.getBuildVector(VT, DL, Ops), BlendMask,
+        DAG);
+  }
+
+  if (SDValue V =
+          pushInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle(N, DAG))
+    return V;
+
   // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
   // vector elements.
@@ -20936,6 +21018,135 @@
                               InnerShuf->getOperand(1), CombinedMask);
 }
 
+static SDValue pullInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle(
+    ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+  if (!Shuf->getOperand(0).hasOneUse() && !Shuf->getOperand(1).hasOneUse())
+    return SDValue();
+
+  SDValue InVec = Shuf->getOperand(0);
+  SDValue OuterBuildVec = Shuf->getOperand(1);
+  SmallVector<int> OuterMask(Shuf->getMask().begin(),
+                             Shuf->getMask().end());
+
+  if (OuterBuildVec.getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(InVec, OuterBuildVec);
+    ShuffleVectorSDNode::commuteMask(OuterMask);
+  }
+
+  if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+      OuterBuildVec.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SDValue InsertVectorElt = InVec;
+  InVec = InsertVectorElt->getOperand(0);
+  SDValue InVal = InsertVectorElt->getOperand(1);
+
+  // We must know which element is being inserted.
+  auto *EltNo = dyn_cast<ConstantSDNode>(InsertVectorElt->getOperand(2));
+  if (!EltNo)
+    return SDValue();
+
+  unsigned Elt = EltNo->getZExtValue();
+
+  SDLoc DL(Shuf);
+  EVT VT = Shuf->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue> NewOps(NumElts, DAG.getUNDEF(VT.getScalarType()));
+  SmallVector<int> NewMask(NumElts, -1);
+  for (unsigned I = 0; I != NumElts; ++I) {
+    if (OuterMask[I] < 0)
+      continue;
+
+    if ((unsigned)OuterMask[I] >= NumElts) {
+      NewOps[I] = OuterBuildVec.getOperand(OuterMask[I] - NumElts);
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    if ((unsigned)OuterMask[I] == Elt) {
+      NewOps[I] = InVal;
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    NewMask[I] = OuterMask[I];
+  }
+  return DAG.getTargetLoweringInfo().buildLegalVectorShuffle(
+      VT, DL, InVec, DAG.getBuildVector(VT, DL, NewOps), NewMask, DAG);
+}
+
+static SDValue mergeTwoVSelectLikeShuffles(ShuffleVectorSDNode *OuterShuf,
+                                           SelectionDAG &DAG) {
+  if (!OuterShuf->getOperand(0).hasOneUse() &&
+      !OuterShuf->getOperand(1).hasOneUse())
+    return SDValue();
+
+  SDValue InVec = OuterShuf->getOperand(0);
+  SDValue OuterBuildVec = OuterShuf->getOperand(1);
+
+  SmallVector<int> OuterMask(OuterShuf->getMask().begin(),
+                             OuterShuf->getMask().end());
+
+  if (OuterBuildVec.getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(InVec, OuterBuildVec);
+    ShuffleVectorSDNode::commuteMask(OuterMask);
+  }
+
+  if (OuterBuildVec.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(InVec);
+  if (!InnerShuf)
+    return SDValue();
+
+  InVec = InnerShuf->getOperand(0);
+  SDValue InnerBuildVec = InnerShuf->getOperand(1);
+
+  SmallVector<int> InnerMask(InnerShuf->getMask().begin(),
+                             InnerShuf->getMask().end());
+
+  if (InnerBuildVec.getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(InVec, InnerBuildVec);
+    ShuffleVectorSDNode::commuteMask(InnerMask);
+  }
+
+  if (InnerBuildVec.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SDLoc DL(OuterShuf);
+
+  EVT VT = OuterShuf->getValueType(0);
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue> NewOps(NumElts, DAG.getUNDEF(VT.getScalarType()));
+  SmallVector<int> NewMask(NumElts, -1);
+  for (unsigned I = 0; I != NumElts; ++I) {
+    if (OuterMask[I] < 0)
+      continue;
+
+    if ((unsigned)OuterMask[I] >= NumElts) {
+      NewOps[I] = OuterBuildVec.getOperand(OuterMask[I] - NumElts);
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    if (InnerMask[OuterMask[I]] < 0)
+      continue;
+
+    if ((unsigned)InnerMask[OuterMask[I]] >= NumElts) {
+      NewOps[I] = InnerBuildVec.getOperand(InnerMask[OuterMask[I]] - NumElts);
+      NewMask[I] = I + NumElts;
+      continue;
+    }
+
+    NewMask[I] =
InnerMask[OuterMask[I]]; + } + return DAG.getTargetLoweringInfo().buildLegalVectorShuffle( + VT, DL, InVec, DAG.getBuildVector(VT, DL, NewOps), NewMask, DAG); +} + /// If the shuffle mask is taking exactly one element from the first vector /// operand and passing through all other elements from the second vector /// operand, return the index of the mask element that is choosing an element @@ -21170,6 +21381,13 @@ if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); + if (SDValue V = pullInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle( + SVN, DAG)) + return V; + + if (SDValue V = mergeTwoVSelectLikeShuffles(SVN, DAG)) + return V; + // This is intentionally placed after demanded elements simplification because // it could eliminate knowledge of undef elements created by this shuffle. if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1261,10 +1261,8 @@ ; CHECK-NEXT: bfi x9, x8, #1, #2 ; CHECK-NEXT: str h0, [x9] ; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: mov v1.h[1], v0.h[1] -; CHECK-NEXT: mov v1.h[2], v0.h[2] -; CHECK-NEXT: mov v1.h[3], v0.h[3] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #16 // =16 ; CHECK-NEXT: ret %tmp = extractelement <8 x i16> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -239,11 +239,11 @@ define <4 x float> @test_insert_2_f32_undef_zero(float %a) { ; CHECK-LABEL: test_insert_2_f32_undef_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: mov.s v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: trn1.4s v0, v1, v0 +; CHECK-NEXT: trn2.4s v0, v0, v1 ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 2 @@ -266,9 +266,9 @@ ; CHECK-LABEL: test_insert_2_f32_var: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: mov.s v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: trn1.4s v0, v1, v0 +; CHECK-NEXT: trn2.4s v0, v0, v1 ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %b, float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 2 diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll --- a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll @@ -119,28 +119,40 @@ define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) { ; CHECK-64-LABEL: testWordImm: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 4, L..C0(2) # %const.0 ; CHECK-64-NEXT: mtfprwz 0, 3 -; CHECK-64-NEXT: xxinsertw 34, 0, 4 -; CHECK-64-NEXT: xxinsertw 34, 0, 12 +; CHECK-64-NEXT: xxspltw 36, 0, 1 +; CHECK-64-NEXT: lxvx 35, 0, 4 +; CHECK-64-NEXT: vperm 2, 2, 4, 3 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testWordImm: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: mtfprwz 0, 4 -; 
CHECK-32-NEXT: xxinsertw 34, 0, 4 -; CHECK-32-NEXT: xxinsertw 34, 0, 12 +; CHECK-32-NEXT: lwz 3, L..C0(2) # %const.0 +; CHECK-32-NEXT: stw 4, -16(1) +; CHECK-32-NEXT: lxvx 35, 0, 3 +; CHECK-32-NEXT: addi 3, 1, -16 +; CHECK-32-NEXT: lxvwsx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 2, 4, 3 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testWordImm: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: vinsw 2, 3, 4 -; CHECK-64-P10-NEXT: vinsw 2, 3, 12 +; CHECK-64-P10-NEXT: ld 4, L..C0(2) # %const.0 +; CHECK-64-P10-NEXT: mtfprwz 0, 3 +; CHECK-64-P10-NEXT: xxspltw 36, 0, 1 +; CHECK-64-P10-NEXT: lxvx 35, 0, 4 +; CHECK-64-P10-NEXT: vperm 2, 2, 4, 3 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testWordImm: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-NEXT: vinsw 2, 4, 4 -; CHECK-32-P10-NEXT: vinsw 2, 4, 12 +; CHECK-32-P10-NEXT: lwz 3, L..C0(2) # %const.0 +; CHECK-32-P10-NEXT: stw 4, -16(1) +; CHECK-32-P10-NEXT: lxvx 35, 0, 3 +; CHECK-32-P10-NEXT: addi 3, 1, -16 +; CHECK-32-P10-NEXT: lxvwsx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 2, 4, 3 ; CHECK-32-P10-NEXT: blr entry: %conv = trunc i64 %b to i32 @@ -262,8 +274,8 @@ define <4 x float> @testFloat1(<4 x float> %a, float %b, i32 zeroext %idx1) { ; CHECK-64-LABEL: testFloat1: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-DAG: rlwinm 3, 4, 2, 28, 29 -; CHECK-64-DAG: addi 4, 1, -16 +; CHECK-64-NEXT: rlwinm 3, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 4, 1, -16 ; CHECK-64-NEXT: stxv 34, -16(1) ; CHECK-64-NEXT: stfsx 1, 4, 3 ; CHECK-64-NEXT: lxv 34, -16(1) @@ -302,8 +314,8 @@ ; CHECK-64-LABEL: testFloat2: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lwz 6, 0(3) -; CHECK-64-DAG: rlwinm 4, 4, 2, 28, 29 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stwx 6, 7, 4 ; CHECK-64-NEXT: rlwinm 4, 5, 2, 28, 29 @@ -365,8 +377,8 @@ ; CHECK-64-LABEL: testFloat3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 6, 1 -; CHECK-64-DAG: rlwinm 4, 4, 2, 28, 29 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: lwzx 6, 3, 6 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stwx 6, 7, 4 @@ -438,29 +450,29 @@ ; CHECK-64-LABEL: testFloatImm1: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: xscvdpspn 0, 1 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: xxspltw 35, 0, 0 +; CHECK-64-NEXT: vmrgow 2, 3, 2 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm1: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: xscvdpspn 0, 1 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: xxspltw 35, 0, 1 +; CHECK-32-NEXT: vmrgow 2, 3, 2 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm1: ; CHECK-64-P10: # %bb.0: # %entry ; CHECK-64-P10-NEXT: xscvdpspn 0, 1 -; CHECK-64-P10-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-P10-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-P10-NEXT: xxspltw 35, 0, 0 +; CHECK-64-P10-NEXT: vmrgow 2, 3, 2 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm1: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: xscvdpspn 0, 1 -; CHECK-32-P10-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-P10-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-P10-NEXT: xxspltw 35, 0, 1 +; CHECK-32-P10-NEXT: vmrgow 2, 3, 2 ; CHECK-32-P10-NEXT: blr entry: %vecins = insertelement <4 x float> %a, float %b, i32 0 @@ -471,38 +483,48 @@ define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) { ; CHECK-64-LABEL: testFloatImm2: ; CHECK-64: # %bb.0: # %entry -; 
CHECK-64-NEXT: lfs 0, 0(3) -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: lfs 0, 4(3) -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: li 4, 4 +; CHECK-64-NEXT: lxsiwzx 36, 0, 3 +; CHECK-64-NEXT: lxsiwzx 35, 3, 4 +; CHECK-64-NEXT: ld 3, L..C1(2) # %const.0 +; CHECK-64-NEXT: vpkudum 3, 4, 3 +; CHECK-64-NEXT: lxvx 36, 0, 3 +; CHECK-64-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm2: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: lfs 0, 0(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-NEXT: lfs 0, 4(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: xscvdpspn 36, 0 +; CHECK-32-NEXT: lfs 1, 4(3) +; CHECK-32-NEXT: xscvdpspn 35, 1 +; CHECK-32-NEXT: lwz 3, L..C1(2) # %const.0 +; CHECK-32-NEXT: vpkudum 3, 4, 3 +; CHECK-32-NEXT: lxvx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm2: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: lwz 4, 0(3) -; CHECK-64-P10-NEXT: lwz 3, 4(3) -; CHECK-64-P10-NEXT: vinsw 2, 4, 0 -; CHECK-64-P10-NEXT: vinsw 2, 3, 8 +; CHECK-64-P10-NEXT: li 4, 4 +; CHECK-64-P10-NEXT: lxsiwzx 36, 0, 3 +; CHECK-64-P10-NEXT: lxsiwzx 35, 3, 4 +; CHECK-64-P10-NEXT: ld 3, L..C1(2) # %const.0 +; CHECK-64-P10-NEXT: vpkudum 3, 4, 3 +; CHECK-64-P10-NEXT: lxvx 36, 0, 3 +; CHECK-64-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm2: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-NEXT: lwz 4, 0(3) -; CHECK-32-P10-NEXT: lwz 3, 4(3) -; CHECK-32-P10-NEXT: vinsw 2, 4, 0 -; CHECK-32-P10-NEXT: vinsw 2, 3, 8 +; CHECK-32-P10-NEXT: lfs 0, 0(3) +; CHECK-32-P10-NEXT: xscvdpspn 36, 0 +; CHECK-32-P10-NEXT: lfs 1, 4(3) +; CHECK-32-P10-NEXT: xscvdpspn 35, 1 +; CHECK-32-P10-NEXT: lwz 3, L..C1(2) # %const.0 +; CHECK-32-P10-NEXT: vpkudum 3, 4, 3 +; CHECK-32-P10-NEXT: lxvx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-P10-NEXT: blr entry: %0 = bitcast i32* %b to float* @@ -519,44 +541,53 @@ ; CHECK-64-LABEL: testFloatImm3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 4, 4 -; CHECK-64-NEXT: lfsx 0, 3, 4 +; CHECK-64-NEXT: lxsiwzx 35, 3, 4 ; CHECK-64-NEXT: li 4, 1 ; CHECK-64-NEXT: rldic 4, 4, 38, 25 -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: lfsx 0, 3, 4 -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: lxsiwzx 36, 3, 4 +; CHECK-64-NEXT: ld 3, L..C2(2) # %const.0 +; CHECK-64-NEXT: vpkudum 3, 3, 4 +; CHECK-64-NEXT: lxvx 36, 0, 3 +; CHECK-64-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm3: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: lis 4, 4 +; CHECK-32-NEXT: lfs 1, 0(3) +; CHECK-32-NEXT: xscvdpspn 36, 1 ; CHECK-32-NEXT: lfsx 0, 3, 4 -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-NEXT: lfs 0, 0(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: lwz 3, L..C2(2) # %const.0 +; CHECK-32-NEXT: xscvdpspn 35, 0 +; CHECK-32-NEXT: vpkudum 3, 3, 4 +; CHECK-32-NEXT: lxvx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm3: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: plwz 4, 262144(3), 0 -; CHECK-64-P10-NEXT: vinsw 2, 4, 0 +; CHECK-64-P10-NEXT: lis 4, 4 +; CHECK-64-P10-NEXT: lxsiwzx 35, 3, 4 ; CHECK-64-P10-NEXT: li 4, 1 ; CHECK-64-P10-NEXT: rldic 4, 4, 38, 25 -; 
CHECK-64-P10-NEXT: lwzx 3, 3, 4 -; CHECK-64-P10-NEXT: vinsw 2, 3, 8 +; CHECK-64-P10-NEXT: lxsiwzx 36, 3, 4 +; CHECK-64-P10-NEXT: ld 3, L..C2(2) # %const.0 +; CHECK-64-P10-NEXT: vpkudum 3, 3, 4 +; CHECK-64-P10-NEXT: lxvx 36, 0, 3 +; CHECK-64-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm3: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: lis 4, 4 -; CHECK-32-P10-NEXT: lwzx 4, 3, 4 -; CHECK-32-P10-NEXT: lwz 3, 0(3) -; CHECK-32-P10-NEXT: vinsw 2, 4, 0 -; CHECK-32-P10-NEXT: vinsw 2, 3, 8 +; CHECK-32-P10-NEXT: lfs 1, 0(3) +; CHECK-32-P10-NEXT: xscvdpspn 36, 1 +; CHECK-32-P10-NEXT: lfsx 0, 3, 4 +; CHECK-32-P10-NEXT: lwz 3, L..C2(2) # %const.0 +; CHECK-32-P10-NEXT: xscvdpspn 35, 0 +; CHECK-32-P10-NEXT: vpkudum 3, 3, 4 +; CHECK-32-P10-NEXT: lxvx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-P10-NEXT: blr entry: %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536 @@ -575,7 +606,7 @@ define <2 x double> @testDouble1(<2 x double> %a, double %b, i32 zeroext %idx1) { ; CHECK-64-LABEL: testDouble1: ; CHECK-64: # %bb.0: # %entry -; CHECK-64: rlwinm 3, 4, 3, 28, 28 +; CHECK-64-NEXT: rlwinm 3, 4, 3, 28, 28 ; CHECK-64-NEXT: addi 4, 1, -16 ; CHECK-64-NEXT: stxv 34, -16(1) ; CHECK-64-NEXT: stfdx 1, 4, 3 @@ -601,8 +632,8 @@ ; ; CHECK-32-P10-LABEL: testDouble1: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-DAG: addi 4, 1, -16 -; CHECK-32-P10-DAG: rlwinm 3, 5, 3, 28, 28 +; CHECK-32-P10-NEXT: addi 4, 1, -16 +; CHECK-32-P10-NEXT: rlwinm 3, 5, 3, 28, 28 ; CHECK-32-P10-NEXT: stxv 34, -16(1) ; CHECK-32-P10-NEXT: stfdx 1, 4, 3 ; CHECK-32-P10-NEXT: lxv 34, -16(1) @@ -616,8 +647,8 @@ ; CHECK-64-LABEL: testDouble2: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: ld 6, 0(3) -; CHECK-64-DAG: rlwinm 4, 4, 3, 28, 28 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 3, 28, 28 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stdx 6, 7, 4 ; CHECK-64-NEXT: li 4, 1 @@ -661,8 +692,8 @@ ; CHECK-32-P10-LABEL: testDouble2: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: lfd 0, 0(3) -; CHECK-32-P10-DAG: addi 6, 1, -32 -; CHECK-32-P10-DAG: rlwinm 4, 4, 3, 28, 28 +; CHECK-32-P10-NEXT: addi 6, 1, -32 +; CHECK-32-P10-NEXT: rlwinm 4, 4, 3, 28, 28 ; CHECK-32-P10-NEXT: stxv 34, -32(1) ; CHECK-32-P10-NEXT: rlwinm 5, 5, 3, 28, 28 ; CHECK-32-P10-NEXT: stfdx 0, 6, 4 @@ -688,8 +719,8 @@ ; CHECK-64-LABEL: testDouble3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 6, 1 -; CHECK-64-DAG: rlwinm 4, 4, 3, 28, 28 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 3, 28, 28 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: ldx 6, 3, 6 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stdx 6, 7, 4 diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll --- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll @@ -99,21 +99,28 @@ define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) { ; CHECK-LABEL: testWordImm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vinsw v2, r5, 8 -; CHECK-NEXT: vinsw v2, r5, 0 +; CHECK-NEXT: mtvsrws v3, r5 +; CHECK-NEXT: vmrgow v2, v3, v2 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testWordImm: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: vinsw v2, r5, 4 -; CHECK-BE-NEXT: vinsw v2, r5, 12 +; CHECK-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: mtfprwz f0, r5 +; CHECK-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-NEXT: xxspltw v4, vs0, 1 +; CHECK-BE-NEXT: lxvx v3, 0, r3 +; CHECK-BE-NEXT: vperm v2, v2, 
v4, v3 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testWordImm: ; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha ; CHECK-P9-NEXT: mtfprwz f0, r5 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 4 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-P9-NEXT: xxspltw v4, vs0, 1 +; CHECK-P9-NEXT: lxvx v3, 0, r3 +; CHECK-P9-NEXT: vperm v2, v2, v4, v3 ; CHECK-P9-NEXT: blr entry: %conv = trunc i64 %b to i32 @@ -344,22 +351,23 @@ ; CHECK-LABEL: testFloatImm1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xscvdpspn vs0, f1 -; CHECK-NEXT: xxinsertw v2, vs0, 12 -; CHECK-NEXT: xxinsertw v2, vs0, 4 +; CHECK-NEXT: plxv v4, .LCPI10_0@PCREL(0), 1 +; CHECK-NEXT: xxspltw v3, vs0, 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm1: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: xscvdpspn vs0, f1 -; CHECK-BE-NEXT: xxinsertw v2, vs0, 0 -; CHECK-BE-NEXT: xxinsertw v2, vs0, 8 +; CHECK-BE-NEXT: xxspltw v3, vs0, 0 +; CHECK-BE-NEXT: vmrgow v2, v3, v2 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm1: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: xscvdpspn vs0, f1 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: xxspltw v3, vs0, 0 +; CHECK-P9-NEXT: vmrgow v2, v3, v2 ; CHECK-P9-NEXT: blr entry: %vecins = insertelement <4 x float> %a, float %b, i32 0 @@ -370,28 +378,37 @@ define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) { ; CHECK-LABEL: testFloatImm2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lwz r3, 0(r5) -; CHECK-NEXT: vinsw v2, r3, 12 -; CHECK-NEXT: lwz r3, 4(r5) -; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: lxsiwzx v4, 0, r5 +; CHECK-NEXT: plxv v5, .LCPI11_0@PCREL(0), 1 +; CHECK-NEXT: lxsiwzx v3, r5, r3 +; CHECK-NEXT: vperm v3, v3, v4, v5 +; CHECK-NEXT: plxv v4, .LCPI11_1@PCREL(0), 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm2: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lwz r3, 0(r5) -; CHECK-BE-NEXT: vinsw v2, r3, 0 -; CHECK-BE-NEXT: lwz r3, 4(r5) -; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: li r3, 4 +; CHECK-BE-NEXT: lxsiwzx v4, 0, r5 +; CHECK-BE-NEXT: lxsiwzx v3, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI11_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI11_0@toc@l +; CHECK-BE-NEXT: vpkudum v3, v4, v3 +; CHECK-BE-NEXT: lxvx v4, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm2: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lfs f0, 0(r5) -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: lfs f0, 4(r5) -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: li r3, 4 +; CHECK-P9-NEXT: lxsiwzx v4, 0, r5 +; CHECK-P9-NEXT: lxsiwzx v3, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI11_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI11_0@toc@l +; CHECK-P9-NEXT: vpkudum v3, v4, v3 +; CHECK-P9-NEXT: lxvx v4, 0, r3 +; CHECK-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-P9-NEXT: blr entry: %0 = bitcast i32* %b to float* @@ -407,35 +424,43 @@ define <4 x float> @testFloatImm3(<4 x float> %a, i32* %b) { ; CHECK-LABEL: testFloatImm3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: plwz r3, 262144(r5), 0 -; CHECK-NEXT: vinsw v2, r3, 12 +; CHECK-NEXT: lis r3, 4 +; CHECK-NEXT: plxv v5, .LCPI12_0@PCREL(0), 1 +; CHECK-NEXT: lxsiwzx v3, r5, r3 ; CHECK-NEXT: li r3, 1 ; CHECK-NEXT: rldic r3, r3, 38, 25 -; CHECK-NEXT: lwzx r3, r5, r3 -; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: lxsiwzx v4, r5, r3 
+; CHECK-NEXT: vperm v3, v4, v3, v5 +; CHECK-NEXT: plxv v4, .LCPI12_1@PCREL(0), 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm3: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: plwz r3, 262144(r5), 0 -; CHECK-BE-NEXT: vinsw v2, r3, 0 +; CHECK-BE-NEXT: lis r3, 4 +; CHECK-BE-NEXT: lxsiwzx v3, r5, r3 ; CHECK-BE-NEXT: li r3, 1 ; CHECK-BE-NEXT: rldic r3, r3, 38, 25 -; CHECK-BE-NEXT: lwzx r3, r5, r3 -; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: lxsiwzx v4, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-BE-NEXT: vpkudum v3, v3, v4 +; CHECK-BE-NEXT: lxvx v4, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm3: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lis r3, 4 -; CHECK-P9-NEXT: lfsx f0, r5, r3 +; CHECK-P9-NEXT: lxsiwzx v3, r5, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: rldic r3, r3, 38, 25 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: lfsx f0, r5, r3 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: lxsiwzx v4, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-P9-NEXT: vpkudum v3, v3, v4 +; CHECK-P9-NEXT: lxvx v4, 0, r3 +; CHECK-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-P9-NEXT: blr entry: %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1397,39 +1397,39 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) { ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r6, r3 ; CHECK-NEXT: ldrb.w r12, [r0] ; CHECK-NEXT: ldrd r3, r0, [r0, #4] ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: strd r4, r4, [sp, #16] -; CHECK-NEXT: beq .LBB17_5 +; CHECK-NEXT: cbz r6, .LBB17_5 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r5, #2 ; CHECK-NEXT: viwdup.u32 q0, r4, r5, #1 ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 -; CHECK-NEXT: ldrd r5, r7, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldr s8, [r0, #8] -; CHECK-NEXT: ldr r6, [r0, #12] -; CHECK-NEXT: vstrw.32 q1, [r4] -; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldr s12, [r0, #16] -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vdup.32 q2, r6 -; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vldr s4, [r0, #4] +; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vldr s8, [r0, #12] ; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: vmov.f32 s11, s12 +; CHECK-NEXT: vldr s6, [r0, #8] +; CHECK-NEXT: vldr s10, [r0, #16] +; CHECK-NEXT: vmov.f32 s5, s4 +; CHECK-NEXT: vmov.f32 s9, s8 +; CHECK-NEXT: ldr r5, [r0] +; CHECK-NEXT: vmov.f32 s7, s6 +; CHECK-NEXT: vstrw.32 q3, [r4] 
+; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] @@ -1459,7 +1459,8 @@ ; CHECK-NEXT: .LBB17_7: ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %5 = alloca [6 x float], align 4 %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1 %7 = load float*, float** %6, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -267,20 +267,17 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: ldrh.w lr, [r0, #4] -; CHECK-NEXT: ldrh.w r12, [r0, #8] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrh r3, [r0, #2] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r2 -; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldrh r4, [r0, #6] +; CHECK-NEXT: ldrh.w r12, [r0, #4] +; CHECK-NEXT: ldrh.w lr, [r0, #2] +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: ldrh r2, [r0, #8] +; CHECK-NEXT: ldrh r3, [r0] ; CHECK-NEXT: ldrh r0, [r0, #10] ; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s3, s2 -; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r12, lr ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/X86/2008-06-25-VecISelBug.ll b/llvm/test/CodeGen/X86/2008-06-25-VecISelBug.ll --- a/llvm/test/CodeGen/X86/2008-06-25-VecISelBug.ll +++ b/llvm/test/CodeGen/X86/2008-06-25-VecISelBug.ll @@ -4,7 +4,7 @@ define void @t() nounwind { ; CHECK-LABEL: t: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0] +; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, 0 entry: %tmp1 = shufflevector <4 x float> zeroinitializer, <4 x float> < float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > diff --git a/llvm/test/CodeGen/X86/avx-cvt-3.ll b/llvm/test/CodeGen/X86/avx-cvt-3.ll --- a/llvm/test/CodeGen/X86/avx-cvt-3.ll +++ b/llvm/test/CodeGen/X86/avx-cvt-3.ll @@ -93,31 +93,13 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_constants_v8i32: ; X86: # %bb.0: -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-NEXT: movl $2, %eax -; X86-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; X86-NEXT: movl $-3, %eax -; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_insert_constants_v8i32: ; X64: # %bb.0: -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendw {{.*#+}} 
xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: movl $2, %eax -; X64-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; X64-NEXT: movl $-3, %eax -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <8 x i32> %a0, i32 0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -6,11 +6,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 +; CHECK-NEXT: vbroadcastss (%rdi), %xmm2 +; CHECK-NEXT: vbroadcastss %xmm1, %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,17,2,3,4,5,6,7,8,9,10,11,12,13,30,15] +; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %rrr = load float, float* %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 @@ -19,14 +19,23 @@ } define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { -; CHECK-LABEL: test2: -; CHECK: ## %bb.0: -; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test2: +; KNL: ## %bb.0: +; KNL-NEXT: vbroadcastsd (%rdi), %ymm2 +; KNL-NEXT: vbroadcastsd %xmm1, %ymm1 +; KNL-NEXT: movb $66, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: test2: +; SKX: ## %bb.0: +; SKX-NEXT: vbroadcastsd (%rdi), %ymm2 +; SKX-NEXT: vbroadcastsd %xmm1, %ymm1 +; SKX-NEXT: movb $66, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq %rrr = load double, double* %br %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 @@ -535,14 +544,26 @@ } define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v8i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastq (%rsi), %xmm1 +; KNL-NEXT: vmovq %rdi, %xmm2 +; KNL-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: movb $10, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastq (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movb $10, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinserti64x4 $0, %ymm1, 
%zmm0, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <8 x i64> %x, i64 %val, i32 1 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3 @@ -550,13 +571,22 @@ } define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v4i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v4i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastq (%rsi), %xmm1 +; KNL-NEXT: vmovq %rdi, %xmm2 +; KNL-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v4i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastq (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3 @@ -566,8 +596,9 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { ; CHECK-LABEL: insert_v2i64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq %rdi, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <2 x i64> %x, i64 %val, i32 1 @@ -576,14 +607,26 @@ } define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v16i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastd (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastd %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: movw $34, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastd (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastd %edi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movw $34, %ax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <16 x i32> %x, i32 %val, i32 1 %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5 @@ -591,13 +634,22 @@ } define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v8i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastd (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastd %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i32: +; SKX: ## %bb.0: +; 
SKX-NEXT: vpbroadcastd (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastd %edi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5 @@ -607,8 +659,9 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) { ; CHECK-LABEL: insert_v4i32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; CHECK-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <4 x i32> %x, i32 %val, i32 1 @@ -617,14 +670,25 @@ } define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { -; CHECK-LABEL: insert_v32i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v32i16: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastw (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpslld $16, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v32i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastw (%rsi), %xmm1 +; SKX-NEXT: vmovd %edi, %xmm2 +; SKX-NEXT: vpslld $16, %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movl $514, %eax ## imm = 0x202 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <32 x i16> %x, i16 %val, i32 1 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9 @@ -634,10 +698,11 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { ; CHECK-LABEL: insert_v16i16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpbroadcastw (%rsi), %xmm1 +; CHECK-NEXT: vmovd %edi, %xmm2 +; CHECK-NEXT: vpslld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] ; CHECK-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 @@ -648,8 +713,10 @@ define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) { ; CHECK-LABEL: insert_v8i16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $5, %edi, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5],xmm0[6,7] ; CHECK-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <8 x i16> %x, i16 %val, i32 1 @@ -658,14 +725,14 @@ } define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v64i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; 
CHECK-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v64i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastb (%rsi), %ymm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastb %xmm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; KNL-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <64 x i8> %x, i8 %val, i32 1 %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50 @@ -673,13 +740,26 @@ } define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v32i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v32i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastb (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpsllw $8, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; KNL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v32i8: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastb (%rsi), %xmm1 +; SKX-NEXT: vmovd %edi, %xmm2 +; SKX-NEXT: vpsllw $8, %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movl $131074, %eax ## imm = 0x20002 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <32 x i8> %x, i8 %val, i32 1 %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17 @@ -687,11 +767,24 @@ } define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v16i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, (%rsi), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, %edi, %xmm1, %xmm1 +; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; KNL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i8: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $3, (%rsi), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $10, %edi, %xmm1, %xmm1 +; SKX-NEXT: movw $1032, %ax ## imm = 0x408 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <16 x i8> %x, i8 %val, i32 3 %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -211,23 +211,20 @@ define <8 x i16> @insert_v8i16_x12345x7(<8 x i16> %a) { ; SSE2-LABEL: insert_v8i16_x12345x7: ; SSE2: # %bb.0: -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i16_x12345x7: ; SSE3: # %bb.0: -; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; 
SSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i16_x12345x7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i16_x12345x7: @@ -282,35 +279,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32,1,2,3,4,5,38,7,8,9,10,11,12,13,14,47] -; AVX512F-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: movw $-32703, %ax # imm = 0x8041 +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 @@ -321,55 +311,59 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE3: # 
%bb.0: -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: movl $255, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: pandn %xmm2, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE3-NEXT: por %xmm2, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $255, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero -; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero +; SSSE3-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE41: # %bb.0: -; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i8_x123456789ABCDEx: -; AVX: # %bb.0: -; AVX-NEXT: movl $255, %eax -; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: movw $-32767, %ax # imm = 0x8001 +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %1 = insertelement <16 x i8> %a, i8 -1, i32 0 %2 = insertelement <16 x i8> %1, i8 -1, i32 15 ret <16 x i8> %2 @@ -445,36 +439,31 @@ ; ; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $255, %eax -; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; AVX2: # %bb.0:
-; AVX2-NEXT: movl $255, %eax
-; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl $255, %eax
-; AVX512-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: movl $-1073709055, %eax # imm = 0xC0008001
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: retq
%1 = insertelement <32 x i8> %a, i8 -1, i32 0
%2 = insertelement <32 x i8> %1, i8 -1, i32 15
%3 = insertelement <32 x i8> %2, i8 -1, i32 30
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -313,23 +313,17 @@
define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
; SSE2-LABEL: insert_v8i16_z12345z7:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm0
-; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v8i16_z12345z7:
; SSE3: # %bb.0:
-; SSE3-NEXT: xorl %eax, %eax
-; SSE3-NEXT: pinsrw $0, %eax, %xmm0
-; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v8i16_z12345z7:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
-; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v8i16_z12345z7:
@@ -391,46 +385,15 @@
}

define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
-; SSE2-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE2: # %bb.0:
-; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: retq
+; SSE-LABEL: insert_v16i8_z123456789ABCDEz:
+; SSE: # %bb.0:
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
;
-; SSE3-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE3: # %bb.0:
-; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: retq
-;
-; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrb $0, %eax, %xmm0
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: insert_v16i8_z123456789ABCDEz:
-; AVX1: # %bb.0:
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: xorl %eax, %eax
-; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
+; AVX: # %bb.0:
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = insertelement <16 x i8> %a, i8 0, i32 0
%2 = insertelement <16 x i8> %1, i8 0, i32 15
ret <16 x i8> %2
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6318,20 +6318,15 @@
define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; SSE2-LABEL: mload_constmask_v4f32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSE2-NEXT: movups (%rdi), %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4f32:
; SSE42: ## %bb.0:
-; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE42-NEXT: movups (%rdi), %xmm1
; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: mload_constmask_v4f32:
@@ -6419,20 +6414,20 @@
define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; SSE2-LABEL: mload_constmask_v4i32:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
+; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: mload_constmask_v4i32:
; SSE42: ## %bb.0:
-; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0
-; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0
-; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0
+; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1
+; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1
+; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: retq
;
; AVX1-LABEL: mload_constmask_v4i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -2422,12 +2422,15 @@
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
; SSE2-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -534,8 +534,8 @@
define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %dst) {
; SKX64-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; SKX64: # %bb.0:
-; SKX64-NEXT: vmovdqa 32(%rdi), %xmm0
-; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SKX64-NEXT: vpbroadcastd 44(%rdi), %xmm0
+; SKX64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; SKX64-NEXT: vmovdqa %ymm0, 672(%rsi)
; SKX64-NEXT: vmovdqa 208(%rdi), %xmm0
; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
@@ -545,11 +545,11 @@
;
; KNL64-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; KNL64: # %bb.0:
-; KNL64-NEXT: vmovdqa 32(%rdi), %xmm0
-; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; KNL64-NEXT: vpbroadcastd 44(%rdi), %xmm0
+; KNL64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; KNL64-NEXT: vmovdqa %ymm0, 672(%rsi)
-; KNL64-NEXT: vmovdqa 208(%rdi), %xmm0
-; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; KNL64-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
+; KNL64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; KNL64-NEXT: vmovdqa %ymm0, 832(%rsi)
; KNL64-NEXT: retq
;
@@ -557,8 +557,8 @@
; SKX32: # %bb.0:
; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; SKX32-NEXT: vmovdqa 32(%ecx), %xmm0
-; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SKX32-NEXT: vpbroadcastd 44(%ecx), %xmm0
+; SKX32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; SKX32-NEXT: vmovdqa %ymm0, 672(%eax)
; SKX32-NEXT: vmovdqa 208(%ecx), %xmm0
; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
@@ -569,13 +569,13 @@
; KNL32-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; KNL32: # %bb.0:
; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL32-NEXT: vmovdqa 32(%eax), %xmm0
-; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; KNL32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; KNL32-NEXT: vmovdqa %ymm0, 672(%ecx)
-; KNL32-NEXT: vmovdqa 208(%eax), %xmm0
-; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; KNL32-NEXT: vmovdqa %ymm0, 832(%ecx)
+; KNL32-NEXT: vpbroadcastd 44(%ecx), %xmm0
+; KNL32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; KNL32-NEXT: vmovdqa %ymm0, 672(%eax)
+; KNL32-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
+; KNL32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; KNL32-NEXT: vmovdqa %ymm0, 832(%eax)
; KNL32-NEXT: retl
%t64 = bitcast <2 x i32>* %src to <16 x i32>*
%t87 = load <16 x i32>, <16 x i32>* %t64, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2756,31 +2756,26 @@
define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
+; SSE2-NEXT: movaps {{.*#+}} xmm1 =
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR30264:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
+; SSSE3-NEXT: movaps {{.*#+}} xmm1 =
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR30264:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps {{.*#+}} xmm1 =
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: PR30264:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 =
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x float> %x, <4 x float> , <4 x i32>
%shuf2 = shufflevector <4 x float> %shuf1, <4 x float> , <4 x i32>
@@ -2835,28 +2830,32 @@
}

define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
-; SSE-LABEL: shuffle_extract_insert:
-; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_extract_insert:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: shuffle_extract_insert:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX1-NEXT: retq
+; SSSE3-LABEL: shuffle_extract_insert:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
+; SSSE3-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_extract_insert:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX2-SLOW-NEXT: retq
+; SSE41-LABEL: shuffle_extract_insert:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
+; SSE41-NEXT: retq
;
-; AVX2-FAST-LABEL: shuffle_extract_insert:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: shuffle_extract_insert:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
+; AVX-NEXT: retq
%a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1
%a3 = extractelement <8 x i16> %a, i32 3
@@ -2928,41 +2927,66 @@
define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_concat_insert:
; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,0,65535]
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0]
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_extract_concat_insert:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u],zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[6,7,u,u,10,11,u,u,14,15]
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,4,5,6,7],zero,zero,xmm0[10,11],zero,zero,xmm0[14,15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,zero,xmm1[4,5],zero,zero,xmm1[0,1],zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_extract_concat_insert:
; SSE41: # %bb.0:
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,0]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_extract_concat_insert:
-; AVX: # %bb.0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_extract_concat_insert:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,0]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_extract_concat_insert:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,0]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
+; AVX2-NEXT: retq
%a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32>
%a0 = extractelement <8 x i16> %a, i32 0
%a4 = extractelement <8 x i16> %a, i32 4
@@ -2984,37 +3008,39 @@
; SSE2-LABEL: shuffle_scalar_to_vector_extract:
; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: pextrw $7, %xmm1, %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pextrw $4, %xmm0, %eax
+; SSE2-NEXT: pextrw $7, %xmm0, %ecx
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pinsrw $1, %eax, %xmm1
+; SSE2-NEXT: pinsrw $4, %ecx, %xmm1
; SSE2-NEXT: movsbl (%rsi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: pinsrw $5, %eax, %xmm1
; SSE2-NEXT: movsbl (%rdx), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pinsrw $6, %eax, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: psraw $8, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSSE3-NEXT: pextrw $7, %xmm0, %ecx
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pinsrw $1, %eax, %xmm1
+; SSSE3-NEXT: pinsrw $4, %ecx, %xmm1
; SSSE3-NEXT: movsbl (%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSSE3-NEXT: pinsrw $5, %eax, %xmm1
; SSSE3-NEXT: movsbl (%rdx), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_scalar_to_vector_extract:
@@ -3024,30 +3050,43 @@
; SSE41-NEXT: pextrw $7, %xmm0, %ecx
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrw $1, %eax, %xmm0
-; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB
-; SSE41-NEXT: pinsrw $2, %eax, %xmm0
; SSE41-NEXT: pinsrw $4, %ecx, %xmm0
; SSE41-NEXT: movsbl (%rsi), %eax
; SSE41-NEXT: pinsrw $5, %eax, %xmm0
; SSE41-NEXT: movsbl (%rdx), %eax
; SSE41-NEXT: pinsrw $6, %eax, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_scalar_to_vector_extract:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB
-; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movsbl (%rsi), %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: movsbl (%rdx), %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_scalar_to_vector_extract:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpextrw $4, %xmm0, %eax
+; AVX1-NEXT: vpextrw $7, %xmm0, %ecx
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movsbl (%rsi), %eax
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movsbl (%rdx), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_scalar_to_vector_extract:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX2-NEXT: vpextrw $4, %xmm0, %eax
+; AVX2-NEXT: vpextrw $7, %xmm0, %ecx
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movsbl (%rsi), %eax
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movsbl (%rdx), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT: retq
%tmp = load <8 x i8>, <8 x i8>* %p0, align 1
%tmp1 = sext <8 x i8> %tmp to <8 x i16>
%tmp2 = load i8, i8* %p1, align 1
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -79,17 +79,16 @@
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movzwl (%ecx), %edx
; X86-SSE2-NEXT: movd %edx, %xmm0
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT: movd %ecx, %xmm2
-; X86-SSE2-NEXT: pslld $16, %xmm2
-; X86-SSE2-NEXT: pandn %xmm2, %xmm1
-; X86-SSE2-NEXT: por %xmm0, %xmm1
-; X86-SSE2-NEXT: pxor %xmm0, %xmm0
-; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pinsrw $4, %ecx, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255]
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: cvtdq2ps %xmm2, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movaps %xmm0, %xmm1
; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -116,17 +115,16 @@
; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movd %eax, %xmm0
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
-; X64-SSE2-NEXT: movd %eax, %xmm2
-; X64-SSE2-NEXT: pslld $16, %xmm2
-; X64-SSE2-NEXT: pandn %xmm2, %xmm1
-; X64-SSE2-NEXT: por %xmm0, %xmm1
-; X64-SSE2-NEXT: pxor %xmm0, %xmm0
-; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0
+; X64-SSE2-NEXT: pxor %xmm1, %xmm1
+; X64-SSE2-NEXT: pinsrw $4, %eax, %xmm1
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255]
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: cvtdq2ps %xmm2, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)