diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18477,6 +18477,24 @@
   if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
+  if (Level == BeforeLegalizeTypes &&
+      InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() &&
+      isa<ConstantSDNode>(InVec.getOperand(2))) {
+    SmallVector<SDValue> Ops(NumElts, DAG.getUNDEF(VT.getScalarType()));
+    unsigned Elt0 = cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+    if (Elt0 < Ops.size())
+      Ops[Elt0] = InVec.getOperand(1);
+    if (Elt < Ops.size())
+      Ops[Elt] = InVal;
+    SmallVector<int> BlendMask;
+    for (unsigned I = 0; I != NumElts; ++I)
+      BlendMask.push_back(I);
+    BlendMask[Elt0] = Elt0 + BlendMask.size();
+    BlendMask[Elt] = Elt + BlendMask.size();
+    return DAG.getVectorShuffle(VT, DL, InVec.getOperand(0),
+                                DAG.getBuildVector(VT, DL, Ops), BlendMask);
+  }
+
   // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
   // vector elements.
@@ -20936,6 +20954,51 @@
                                 InnerShuf->getOperand(1), CombinedMask);
 }
 
+static SDValue pullInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle(
+    ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
+  if (!ShuffleVectorInst::isSelectMask(Shuf->getMask()))
+    return SDValue();
+
+  SDValue N0 = Shuf->getOperand(0);
+  SDValue N1 = Shuf->getOperand(1);
+
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  SmallVector<int> Mask(Shuf->getMask().begin(), Shuf->getMask().end());
+
+  if (N1.getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(N0, N1);
+    ShuffleVectorSDNode::commuteMask(Mask);
+  }
+
+  if (N0.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+      N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SDValue InVec = N0->getOperand(0);
+  SDValue InVal = N0->getOperand(1);
+  SDLoc DL(Shuf);
+
+  EVT VT = Shuf->getValueType(0);
+
+  // We must know which element is being inserted.
+  auto *EltNo = dyn_cast<ConstantSDNode>(N0->getOperand(2));
+  if (!EltNo)
+    return SDValue();
+
+  unsigned Elt = EltNo->getZExtValue();
+
+  SmallVector<SDValue> Ops(N1->op_begin(), N1->op_end());
+  // Has the shuffle kept the inserted element?
+  if (Elt < Ops.size() && Mask[Elt] == (int)Elt) {
+    Ops[Elt] = InVal;
+    Mask[Elt] = Elt + Mask.size();
+  }
+  return DAG.getVectorShuffle(VT, DL, InVec, DAG.getBuildVector(VT, DL, Ops),
+                              Mask);
+}
+
 /// If the shuffle mask is taking exactly one element from the first vector
 /// operand and passing through all other elements from the second vector
 /// operand, return the index of the mask element that is choosing an element
@@ -21170,6 +21233,10 @@
   if (SimplifyDemandedVectorElts(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  if (SDValue V = pullInsertVectorEltIntoBuildVectorOperandOfVSelectLikeShuffle(
+          SVN, DAG))
+    return V;
+
   // This is intentionally placed after demanded elements simplification because
   // it could eliminate knowledge of undef elements created by this shuffle.
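For reference, a minimal sketch of the DAG pattern the first hunk rewrites, using the <16 x i8> case from the AArch64 tests below (node names tN are illustrative and the base vector t0 is zeroinitializer in that test):

; Before: a one-use chain of insert_vector_elt nodes with constant indices.
;   t1 = insert_vector_elt t0, %a, Constant:i64<2>
;   t2 = insert_vector_elt t1, %a, Constant:i64<1>
; After: a single blend shuffle of the original vector with a build_vector
; holding the inserted scalars (lanes 1 and 2 select from the second operand).
;   t3 = build_vector undef, %a, %a, undef, undef, undef, undef, undef,
;                     undef, undef, undef, undef, undef, undef, undef, undef
;   t4 = vector_shuffle<0,17,18,3,4,5,6,7,8,9,10,11,12,13,14,15> t0, t3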
if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1261,10 +1261,8 @@ ; CHECK-NEXT: bfi x9, x8, #1, #2 ; CHECK-NEXT: str h0, [x9] ; CHECK-NEXT: ldr d1, [sp, #8] -; CHECK-NEXT: mov v1.h[1], v0.h[1] -; CHECK-NEXT: mov v1.h[2], v0.h[2] -; CHECK-NEXT: mov v1.h[3], v0.h[3] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov v0.h[0], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #16 // =16 ; CHECK-NEXT: ret %tmp = extractelement <8 x i16> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -44,9 +44,11 @@ define <16 x i8> @test_insert_v16i8_insert_2(i8 %a) { ; CHECK-LABEL: test_insert_v16i8_insert_2: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.b v0[1], w0 -; CHECK-NEXT: mov.b v0[2], w0 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: dup.16b v2, w0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 2 %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 @@ -56,9 +58,11 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) { ; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.16b v0, w0 -; CHECK-NEXT: mov.b v0[5], wzr -; CHECK-NEXT: mov.b v0[9], wzr +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.16b v2, w0 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <16 x i8> , i8 %a, i32 0 %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 @@ -80,13 +84,15 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) { ; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.16b v0, w0 -; CHECK-NEXT: mov.b v0[2], w1 -; CHECK-NEXT: mov.b v0[5], wzr -; CHECK-NEXT: mov.b v0[7], w1 -; CHECK-NEXT: mov.b v0[9], wzr -; CHECK-NEXT: mov.b v0[12], w1 -; CHECK-NEXT: mov.b v0[15], w1 +; CHECK-NEXT: movi.2d v0, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: dup.16b v1, w0 +; CHECK-NEXT: mov.b v1[2], w1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: mov.b v1[7], w1 +; CHECK-NEXT: mov.b v1[12], w1 +; CHECK-NEXT: mov.b v1[15], w1 +; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2 ; CHECK-NEXT: ret %v.0 = insertelement <16 x i8> , i8 %a, i32 0 %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 @@ -126,11 +132,12 @@ define <8 x half> @test_insert_v8f16_insert_2(half %a) { ; CHECK-LABEL: test_insert_v8f16_insert_2: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 -; CHECK-NEXT: mov.h v1[1], v0[0] -; CHECK-NEXT: mov.h v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: dup.8h v3, v0[0] +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v1 ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> zeroinitializer, half %a, i32 2 %v.1 = insertelement <8 x 
half> %v.0, half %a, i32 1 @@ -140,9 +147,11 @@ define <8 x i16> @test_insert_v8i16_insert_2(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_2: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.8h v0, w0 -; CHECK-NEXT: mov.h v0[3], wzr -; CHECK-NEXT: mov.h v0[7], wzr +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.8h v2, w0 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.1 = insertelement <8 x i16> %v.0, i16 %a, i32 1 @@ -156,10 +165,11 @@ define <8 x i16> @test_insert_v8i16_insert_3(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_3: ; CHECK: // %bb.0: -; CHECK-NEXT: dup.8h v0, w0 -; CHECK-NEXT: mov.h v0[1], wzr -; CHECK-NEXT: mov.h v0[3], wzr -; CHECK-NEXT: mov.h v0[7], wzr +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.8h v2, w0 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.2 = insertelement <8 x i16> %v.0, i16 %a, i32 2 @@ -172,11 +182,11 @@ define <8 x i16> @test_insert_v8i16_insert_4(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_4: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.h v0[0], w0 -; CHECK-NEXT: mov.h v0[2], w0 -; CHECK-NEXT: mov.h v0[4], w0 -; CHECK-NEXT: mov.h v0[5], w0 +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.8h v2, w0 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.2 = insertelement <8 x i16> %v.0, i16 %a, i32 2 @@ -188,10 +198,11 @@ define <8 x i16> @test_insert_v8i16_insert_5(i16 %a) { ; CHECK-LABEL: test_insert_v8i16_insert_5: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: mov.h v0[0], w0 -; CHECK-NEXT: mov.h v0[4], w0 -; CHECK-NEXT: mov.h v0[5], w0 +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.8h v2, w0 +; CHECK-NEXT: tbl.16b v0, { v1, v2 }, v0 ; CHECK-NEXT: ret %v.0 = insertelement <8 x i16> , i16 %a, i32 0 %v.3 = insertelement <8 x i16> %v.0, i16 %a, i32 4 @@ -239,11 +250,11 @@ define <4 x float> @test_insert_2_f32_undef_zero(float %a) { ; CHECK-LABEL: test_insert_2_f32_undef_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: mov.s v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: trn1.4s v0, v1, v0 +; CHECK-NEXT: trn2.4s v0, v0, v1 ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 2 @@ -266,9 +277,9 @@ ; CHECK-LABEL: test_insert_2_f32_var: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[0] -; CHECK-NEXT: mov.s v1[2], v0[0] -; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: dup.4s v0, v0[0] +; CHECK-NEXT: trn1.4s v0, v1, v0 +; CHECK-NEXT: trn2.4s v0, v0, v1 ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %b, float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 2 diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll --- 
a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll @@ -119,28 +119,40 @@ define <4 x i32> @testWordImm(<4 x i32> %a, i64 %b) { ; CHECK-64-LABEL: testWordImm: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 4, L..C0(2) # %const.0 ; CHECK-64-NEXT: mtfprwz 0, 3 -; CHECK-64-NEXT: xxinsertw 34, 0, 4 -; CHECK-64-NEXT: xxinsertw 34, 0, 12 +; CHECK-64-NEXT: xxspltw 36, 0, 1 +; CHECK-64-NEXT: lxvx 35, 0, 4 +; CHECK-64-NEXT: vperm 2, 2, 4, 3 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testWordImm: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: mtfprwz 0, 4 -; CHECK-32-NEXT: xxinsertw 34, 0, 4 -; CHECK-32-NEXT: xxinsertw 34, 0, 12 +; CHECK-32-NEXT: lwz 3, L..C0(2) # %const.0 +; CHECK-32-NEXT: stw 4, -16(1) +; CHECK-32-NEXT: lxvx 35, 0, 3 +; CHECK-32-NEXT: addi 3, 1, -16 +; CHECK-32-NEXT: lxvwsx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 2, 4, 3 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testWordImm: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: vinsw 2, 3, 4 -; CHECK-64-P10-NEXT: vinsw 2, 3, 12 +; CHECK-64-P10-NEXT: ld 4, L..C0(2) # %const.0 +; CHECK-64-P10-NEXT: mtfprwz 0, 3 +; CHECK-64-P10-NEXT: xxspltw 36, 0, 1 +; CHECK-64-P10-NEXT: lxvx 35, 0, 4 +; CHECK-64-P10-NEXT: vperm 2, 2, 4, 3 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testWordImm: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-NEXT: vinsw 2, 4, 4 -; CHECK-32-P10-NEXT: vinsw 2, 4, 12 +; CHECK-32-P10-NEXT: lwz 3, L..C0(2) # %const.0 +; CHECK-32-P10-NEXT: stw 4, -16(1) +; CHECK-32-P10-NEXT: lxvx 35, 0, 3 +; CHECK-32-P10-NEXT: addi 3, 1, -16 +; CHECK-32-P10-NEXT: lxvwsx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 2, 4, 3 ; CHECK-32-P10-NEXT: blr entry: %conv = trunc i64 %b to i32 @@ -262,8 +274,8 @@ define <4 x float> @testFloat1(<4 x float> %a, float %b, i32 zeroext %idx1) { ; CHECK-64-LABEL: testFloat1: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-DAG: rlwinm 3, 4, 2, 28, 29 -; CHECK-64-DAG: addi 4, 1, -16 +; CHECK-64-NEXT: rlwinm 3, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 4, 1, -16 ; CHECK-64-NEXT: stxv 34, -16(1) ; CHECK-64-NEXT: stfsx 1, 4, 3 ; CHECK-64-NEXT: lxv 34, -16(1) @@ -302,8 +314,8 @@ ; CHECK-64-LABEL: testFloat2: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lwz 6, 0(3) -; CHECK-64-DAG: rlwinm 4, 4, 2, 28, 29 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stwx 6, 7, 4 ; CHECK-64-NEXT: rlwinm 4, 5, 2, 28, 29 @@ -365,8 +377,8 @@ ; CHECK-64-LABEL: testFloat3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 6, 1 -; CHECK-64-DAG: rlwinm 4, 4, 2, 28, 29 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 2, 28, 29 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: lwzx 6, 3, 6 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stwx 6, 7, 4 @@ -438,29 +450,29 @@ ; CHECK-64-LABEL: testFloatImm1: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: xscvdpspn 0, 1 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: xxspltw 35, 0, 0 +; CHECK-64-NEXT: vmrgow 2, 3, 2 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm1: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: xscvdpspn 0, 1 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: xxspltw 35, 0, 1 +; CHECK-32-NEXT: vmrgow 2, 3, 2 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm1: ; CHECK-64-P10: # %bb.0: # %entry ; CHECK-64-P10-NEXT: xscvdpspn 0, 1 -; CHECK-64-P10-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-P10-NEXT: xxinsertw 34, 0, 8 
+; CHECK-64-P10-NEXT: xxspltw 35, 0, 0 +; CHECK-64-P10-NEXT: vmrgow 2, 3, 2 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm1: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: xscvdpspn 0, 1 -; CHECK-32-P10-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-P10-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-P10-NEXT: xxspltw 35, 0, 1 +; CHECK-32-P10-NEXT: vmrgow 2, 3, 2 ; CHECK-32-P10-NEXT: blr entry: %vecins = insertelement <4 x float> %a, float %b, i32 0 @@ -471,38 +483,48 @@ define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) { ; CHECK-64-LABEL: testFloatImm2: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: lfs 0, 0(3) -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: lfs 0, 4(3) -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: li 4, 4 +; CHECK-64-NEXT: lxsiwzx 36, 0, 3 +; CHECK-64-NEXT: lxsiwzx 35, 3, 4 +; CHECK-64-NEXT: ld 3, L..C1(2) # %const.0 +; CHECK-64-NEXT: vpkudum 3, 4, 3 +; CHECK-64-NEXT: lxvx 36, 0, 3 +; CHECK-64-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm2: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: lfs 0, 0(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; CHECK-32-NEXT: lfs 0, 4(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: xscvdpspn 36, 0 +; CHECK-32-NEXT: lfs 1, 4(3) +; CHECK-32-NEXT: xscvdpspn 35, 1 +; CHECK-32-NEXT: lwz 3, L..C1(2) # %const.0 +; CHECK-32-NEXT: vpkudum 3, 4, 3 +; CHECK-32-NEXT: lxvx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm2: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: lwz 4, 0(3) -; CHECK-64-P10-NEXT: lwz 3, 4(3) -; CHECK-64-P10-NEXT: vinsw 2, 4, 0 -; CHECK-64-P10-NEXT: vinsw 2, 3, 8 +; CHECK-64-P10-NEXT: li 4, 4 +; CHECK-64-P10-NEXT: lxsiwzx 36, 0, 3 +; CHECK-64-P10-NEXT: lxsiwzx 35, 3, 4 +; CHECK-64-P10-NEXT: ld 3, L..C1(2) # %const.0 +; CHECK-64-P10-NEXT: vpkudum 3, 4, 3 +; CHECK-64-P10-NEXT: lxvx 36, 0, 3 +; CHECK-64-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm2: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-NEXT: lwz 4, 0(3) -; CHECK-32-P10-NEXT: lwz 3, 4(3) -; CHECK-32-P10-NEXT: vinsw 2, 4, 0 -; CHECK-32-P10-NEXT: vinsw 2, 3, 8 +; CHECK-32-P10-NEXT: lfs 0, 0(3) +; CHECK-32-P10-NEXT: xscvdpspn 36, 0 +; CHECK-32-P10-NEXT: lfs 1, 4(3) +; CHECK-32-P10-NEXT: xscvdpspn 35, 1 +; CHECK-32-P10-NEXT: lwz 3, L..C1(2) # %const.0 +; CHECK-32-P10-NEXT: vpkudum 3, 4, 3 +; CHECK-32-P10-NEXT: lxvx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-P10-NEXT: blr entry: %0 = bitcast i32* %b to float* @@ -519,44 +541,53 @@ ; CHECK-64-LABEL: testFloatImm3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 4, 4 -; CHECK-64-NEXT: lfsx 0, 3, 4 +; CHECK-64-NEXT: lxsiwzx 35, 3, 4 ; CHECK-64-NEXT: li 4, 1 ; CHECK-64-NEXT: rldic 4, 4, 38, 25 -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 0 -; CHECK-64-NEXT: lfsx 0, 3, 4 -; CHECK-64-NEXT: xscvdpspn 0, 0 -; CHECK-64-NEXT: xxinsertw 34, 0, 8 +; CHECK-64-NEXT: lxsiwzx 36, 3, 4 +; CHECK-64-NEXT: ld 3, L..C2(2) # %const.0 +; CHECK-64-NEXT: vpkudum 3, 3, 4 +; CHECK-64-NEXT: lxvx 36, 0, 3 +; CHECK-64-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testFloatImm3: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: lis 4, 4 +; CHECK-32-NEXT: lfs 1, 0(3) +; CHECK-32-NEXT: xscvdpspn 36, 1 ; CHECK-32-NEXT: lfsx 0, 3, 4 -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 0 -; 
CHECK-32-NEXT: lfs 0, 0(3) -; CHECK-32-NEXT: xscvdpspn 0, 0 -; CHECK-32-NEXT: xxinsertw 34, 0, 8 +; CHECK-32-NEXT: lwz 3, L..C2(2) # %const.0 +; CHECK-32-NEXT: xscvdpspn 35, 0 +; CHECK-32-NEXT: vpkudum 3, 3, 4 +; CHECK-32-NEXT: lxvx 36, 0, 3 +; CHECK-32-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testFloatImm3: ; CHECK-64-P10: # %bb.0: # %entry -; CHECK-64-P10-NEXT: plwz 4, 262144(3), 0 -; CHECK-64-P10-NEXT: vinsw 2, 4, 0 +; CHECK-64-P10-NEXT: lis 4, 4 +; CHECK-64-P10-NEXT: lxsiwzx 35, 3, 4 ; CHECK-64-P10-NEXT: li 4, 1 ; CHECK-64-P10-NEXT: rldic 4, 4, 38, 25 -; CHECK-64-P10-NEXT: lwzx 3, 3, 4 -; CHECK-64-P10-NEXT: vinsw 2, 3, 8 +; CHECK-64-P10-NEXT: lxsiwzx 36, 3, 4 +; CHECK-64-P10-NEXT: ld 3, L..C2(2) # %const.0 +; CHECK-64-P10-NEXT: vpkudum 3, 3, 4 +; CHECK-64-P10-NEXT: lxvx 36, 0, 3 +; CHECK-64-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testFloatImm3: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: lis 4, 4 -; CHECK-32-P10-NEXT: lwzx 4, 3, 4 -; CHECK-32-P10-NEXT: lwz 3, 0(3) -; CHECK-32-P10-NEXT: vinsw 2, 4, 0 -; CHECK-32-P10-NEXT: vinsw 2, 3, 8 +; CHECK-32-P10-NEXT: lfs 1, 0(3) +; CHECK-32-P10-NEXT: xscvdpspn 36, 1 +; CHECK-32-P10-NEXT: lfsx 0, 3, 4 +; CHECK-32-P10-NEXT: lwz 3, L..C2(2) # %const.0 +; CHECK-32-P10-NEXT: xscvdpspn 35, 0 +; CHECK-32-P10-NEXT: vpkudum 3, 3, 4 +; CHECK-32-P10-NEXT: lxvx 36, 0, 3 +; CHECK-32-P10-NEXT: vperm 2, 3, 2, 4 ; CHECK-32-P10-NEXT: blr entry: %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536 @@ -575,7 +606,7 @@ define <2 x double> @testDouble1(<2 x double> %a, double %b, i32 zeroext %idx1) { ; CHECK-64-LABEL: testDouble1: ; CHECK-64: # %bb.0: # %entry -; CHECK-64: rlwinm 3, 4, 3, 28, 28 +; CHECK-64-NEXT: rlwinm 3, 4, 3, 28, 28 ; CHECK-64-NEXT: addi 4, 1, -16 ; CHECK-64-NEXT: stxv 34, -16(1) ; CHECK-64-NEXT: stfdx 1, 4, 3 @@ -601,8 +632,8 @@ ; ; CHECK-32-P10-LABEL: testDouble1: ; CHECK-32-P10: # %bb.0: # %entry -; CHECK-32-P10-DAG: addi 4, 1, -16 -; CHECK-32-P10-DAG: rlwinm 3, 5, 3, 28, 28 +; CHECK-32-P10-NEXT: addi 4, 1, -16 +; CHECK-32-P10-NEXT: rlwinm 3, 5, 3, 28, 28 ; CHECK-32-P10-NEXT: stxv 34, -16(1) ; CHECK-32-P10-NEXT: stfdx 1, 4, 3 ; CHECK-32-P10-NEXT: lxv 34, -16(1) @@ -616,8 +647,8 @@ ; CHECK-64-LABEL: testDouble2: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: ld 6, 0(3) -; CHECK-64-DAG: rlwinm 4, 4, 3, 28, 28 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 3, 28, 28 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stdx 6, 7, 4 ; CHECK-64-NEXT: li 4, 1 @@ -661,8 +692,8 @@ ; CHECK-32-P10-LABEL: testDouble2: ; CHECK-32-P10: # %bb.0: # %entry ; CHECK-32-P10-NEXT: lfd 0, 0(3) -; CHECK-32-P10-DAG: addi 6, 1, -32 -; CHECK-32-P10-DAG: rlwinm 4, 4, 3, 28, 28 +; CHECK-32-P10-NEXT: addi 6, 1, -32 +; CHECK-32-P10-NEXT: rlwinm 4, 4, 3, 28, 28 ; CHECK-32-P10-NEXT: stxv 34, -32(1) ; CHECK-32-P10-NEXT: rlwinm 5, 5, 3, 28, 28 ; CHECK-32-P10-NEXT: stfdx 0, 6, 4 @@ -688,8 +719,8 @@ ; CHECK-64-LABEL: testDouble3: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: lis 6, 1 -; CHECK-64-DAG: rlwinm 4, 4, 3, 28, 28 -; CHECK-64-DAG: addi 7, 1, -32 +; CHECK-64-NEXT: rlwinm 4, 4, 3, 28, 28 +; CHECK-64-NEXT: addi 7, 1, -32 ; CHECK-64-NEXT: ldx 6, 3, 6 ; CHECK-64-NEXT: stxv 34, -32(1) ; CHECK-64-NEXT: stdx 6, 7, 4 diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll --- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll @@ -99,21 +99,28 @@ define <4 x i32> 
@testWordImm(<4 x i32> %a, i64 %b) { ; CHECK-LABEL: testWordImm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vinsw v2, r5, 8 -; CHECK-NEXT: vinsw v2, r5, 0 +; CHECK-NEXT: mtvsrws v3, r5 +; CHECK-NEXT: vmrgow v2, v3, v2 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testWordImm: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: vinsw v2, r5, 4 -; CHECK-BE-NEXT: vinsw v2, r5, 12 +; CHECK-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-NEXT: mtfprwz f0, r5 +; CHECK-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-NEXT: xxspltw v4, vs0, 1 +; CHECK-BE-NEXT: lxvx v3, 0, r3 +; CHECK-BE-NEXT: vperm v2, v2, v4, v3 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testWordImm: ; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha ; CHECK-P9-NEXT: mtfprwz f0, r5 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 4 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-P9-NEXT: xxspltw v4, vs0, 1 +; CHECK-P9-NEXT: lxvx v3, 0, r3 +; CHECK-P9-NEXT: vperm v2, v2, v4, v3 ; CHECK-P9-NEXT: blr entry: %conv = trunc i64 %b to i32 @@ -344,22 +351,23 @@ ; CHECK-LABEL: testFloatImm1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xscvdpspn vs0, f1 -; CHECK-NEXT: xxinsertw v2, vs0, 12 -; CHECK-NEXT: xxinsertw v2, vs0, 4 +; CHECK-NEXT: plxv v4, .LCPI10_0@PCREL(0), 1 +; CHECK-NEXT: xxspltw v3, vs0, 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm1: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: xscvdpspn vs0, f1 -; CHECK-BE-NEXT: xxinsertw v2, vs0, 0 -; CHECK-BE-NEXT: xxinsertw v2, vs0, 8 +; CHECK-BE-NEXT: xxspltw v3, vs0, 0 +; CHECK-BE-NEXT: vmrgow v2, v3, v2 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm1: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: xscvdpspn vs0, f1 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: xxspltw v3, vs0, 0 +; CHECK-P9-NEXT: vmrgow v2, v3, v2 ; CHECK-P9-NEXT: blr entry: %vecins = insertelement <4 x float> %a, float %b, i32 0 @@ -370,28 +378,37 @@ define <4 x float> @testFloatImm2(<4 x float> %a, i32* %b) { ; CHECK-LABEL: testFloatImm2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lwz r3, 0(r5) -; CHECK-NEXT: vinsw v2, r3, 12 -; CHECK-NEXT: lwz r3, 4(r5) -; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: lxsiwzx v4, 0, r5 +; CHECK-NEXT: plxv v5, .LCPI11_0@PCREL(0), 1 +; CHECK-NEXT: lxsiwzx v3, r5, r3 +; CHECK-NEXT: vperm v3, v3, v4, v5 +; CHECK-NEXT: plxv v4, .LCPI11_1@PCREL(0), 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm2: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lwz r3, 0(r5) -; CHECK-BE-NEXT: vinsw v2, r3, 0 -; CHECK-BE-NEXT: lwz r3, 4(r5) -; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: li r3, 4 +; CHECK-BE-NEXT: lxsiwzx v4, 0, r5 +; CHECK-BE-NEXT: lxsiwzx v3, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI11_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI11_0@toc@l +; CHECK-BE-NEXT: vpkudum v3, v4, v3 +; CHECK-BE-NEXT: lxvx v4, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm2: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lfs f0, 0(r5) -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: lfs f0, 4(r5) -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: li r3, 4 +; CHECK-P9-NEXT: lxsiwzx v4, 0, r5 +; CHECK-P9-NEXT: lxsiwzx v3, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI11_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI11_0@toc@l +; CHECK-P9-NEXT: vpkudum v3, v4, v3 +; 
CHECK-P9-NEXT: lxvx v4, 0, r3 +; CHECK-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-P9-NEXT: blr entry: %0 = bitcast i32* %b to float* @@ -407,35 +424,43 @@ define <4 x float> @testFloatImm3(<4 x float> %a, i32* %b) { ; CHECK-LABEL: testFloatImm3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: plwz r3, 262144(r5), 0 -; CHECK-NEXT: vinsw v2, r3, 12 +; CHECK-NEXT: lis r3, 4 +; CHECK-NEXT: plxv v5, .LCPI12_0@PCREL(0), 1 +; CHECK-NEXT: lxsiwzx v3, r5, r3 ; CHECK-NEXT: li r3, 1 ; CHECK-NEXT: rldic r3, r3, 38, 25 -; CHECK-NEXT: lwzx r3, r5, r3 -; CHECK-NEXT: vinsw v2, r3, 4 +; CHECK-NEXT: lxsiwzx v4, r5, r3 +; CHECK-NEXT: vperm v3, v4, v3, v5 +; CHECK-NEXT: plxv v4, .LCPI12_1@PCREL(0), 1 +; CHECK-NEXT: vperm v2, v2, v3, v4 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testFloatImm3: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: plwz r3, 262144(r5), 0 -; CHECK-BE-NEXT: vinsw v2, r3, 0 +; CHECK-BE-NEXT: lis r3, 4 +; CHECK-BE-NEXT: lxsiwzx v3, r5, r3 ; CHECK-BE-NEXT: li r3, 1 ; CHECK-BE-NEXT: rldic r3, r3, 38, 25 -; CHECK-BE-NEXT: lwzx r3, r5, r3 -; CHECK-BE-NEXT: vinsw v2, r3, 8 +; CHECK-BE-NEXT: lxsiwzx v4, r5, r3 +; CHECK-BE-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-BE-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-BE-NEXT: vpkudum v3, v3, v4 +; CHECK-BE-NEXT: lxvx v4, 0, r3 +; CHECK-BE-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testFloatImm3: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: lis r3, 4 -; CHECK-P9-NEXT: lfsx f0, r5, r3 +; CHECK-P9-NEXT: lxsiwzx v3, r5, r3 ; CHECK-P9-NEXT: li r3, 1 ; CHECK-P9-NEXT: rldic r3, r3, 38, 25 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 0 -; CHECK-P9-NEXT: lfsx f0, r5, r3 -; CHECK-P9-NEXT: xscvdpspn vs0, f0 -; CHECK-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-P9-NEXT: lxsiwzx v4, r5, r3 +; CHECK-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-P9-NEXT: vpkudum v3, v3, v4 +; CHECK-P9-NEXT: lxvx v4, 0, r3 +; CHECK-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-P9-NEXT: blr entry: %add.ptr = getelementptr inbounds i32, i32* %b, i64 65536 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1397,39 +1397,39 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) { ; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r6, r3 ; CHECK-NEXT: ldrb.w r12, [r0] ; CHECK-NEXT: ldrd r3, r0, [r0, #4] ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: strd r4, r4, [sp, #16] -; CHECK-NEXT: beq .LBB17_5 +; CHECK-NEXT: cbz r6, .LBB17_5 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r5, #2 ; CHECK-NEXT: viwdup.u32 q0, r4, r5, #1 ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 -; CHECK-NEXT: ldrd r5, r7, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldr s8, [r0, #8] -; CHECK-NEXT: ldr r6, [r0, #12] -; CHECK-NEXT: vstrw.32 q1, [r4] 
-; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldr s12, [r0, #16] -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vdup.32 q2, r6 -; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vldr s4, [r0, #4] +; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vldr s8, [r0, #12] ; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: vmov.f32 s11, s12 +; CHECK-NEXT: vldr s6, [r0, #8] +; CHECK-NEXT: vldr s10, [r0, #16] +; CHECK-NEXT: vmov.f32 s5, s4 +; CHECK-NEXT: vmov.f32 s9, s8 +; CHECK-NEXT: ldr r5, [r0] +; CHECK-NEXT: vmov.f32 s7, s6 +; CHECK-NEXT: vstrw.32 q3, [r4] +; CHECK-NEXT: vmov.f32 s11, s10 +; CHECK-NEXT: dls lr, r6 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] @@ -1459,7 +1459,8 @@ ; CHECK-NEXT: .LBB17_7: ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %5 = alloca [6 x float], align 4 %6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1 %7 = load float*, float** %6, align 4 diff --git a/llvm/test/CodeGen/X86/avx-cvt-3.ll b/llvm/test/CodeGen/X86/avx-cvt-3.ll --- a/llvm/test/CodeGen/X86/avx-cvt-3.ll +++ b/llvm/test/CodeGen/X86/avx-cvt-3.ll @@ -93,31 +93,13 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_constants_v8i32: ; X86: # %bb.0: -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-NEXT: movl $2, %eax -; X86-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; X86-NEXT: movl $-3, %eax -; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_insert_constants_v8i32: ; X64: # %bb.0: -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X64-NEXT: movl $2, %eax -; X64-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 -; X64-NEXT: movl $-3, %eax -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <8 x i32> %a0, i32 0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -6,11 +6,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { ; CHECK-LABEL: test1: ; CHECK: ## %bb.0: -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 +; CHECK-NEXT: vbroadcastss (%rdi), %xmm2 +; CHECK-NEXT: 
vbroadcastss %xmm1, %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,17,2,3,4,5,6,7,8,9,10,11,12,13,30,15] +; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %rrr = load float, float* %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 @@ -19,14 +19,23 @@ } define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { -; CHECK-LABEL: test2: -; CHECK: ## %bb.0: -; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test2: +; KNL: ## %bb.0: +; KNL-NEXT: vbroadcastsd (%rdi), %ymm2 +; KNL-NEXT: vbroadcastsd %xmm1, %ymm1 +; KNL-NEXT: movb $66, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: test2: +; SKX: ## %bb.0: +; SKX-NEXT: vbroadcastsd (%rdi), %ymm2 +; SKX-NEXT: vbroadcastsd %xmm1, %ymm1 +; SKX-NEXT: movb $66, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq %rrr = load double, double* %br %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 @@ -535,14 +544,26 @@ } define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v8i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastq (%rsi), %xmm1 +; KNL-NEXT: vmovq %rdi, %xmm2 +; KNL-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: movb $10, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastq (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movb $10, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <8 x i64> %x, i64 %val, i32 1 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3 @@ -550,13 +571,22 @@ } define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v4i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v4i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastq (%rsi), %xmm1 +; KNL-NEXT: vmovq %rdi, %xmm2 +; KNL-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v4i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastq (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, 
i32 1 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3 @@ -566,8 +596,9 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { ; CHECK-LABEL: insert_v2i64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0 +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovq %rdi, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <2 x i64> %x, i64 %val, i32 1 @@ -576,14 +607,26 @@ } define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v16i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastd (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastd %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: movw $34, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastd (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastd %edi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movw $34, %ax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <16 x i32> %x, i32 %val, i32 1 %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5 @@ -591,13 +634,22 @@ } define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v8i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastd (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastd %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastd (%rsi), %xmm1 +; SKX-NEXT: vpbroadcastd %edi, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5 @@ -607,8 +659,9 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) { ; CHECK-LABEL: insert_v4i32: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; CHECK-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <4 x i32> %x, i32 %val, i32 1 @@ -617,14 +670,25 @@ } define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { -; CHECK-LABEL: insert_v32i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, 
%edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v32i16: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastw (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpslld $16, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v32i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastw (%rsi), %xmm1 +; SKX-NEXT: vmovd %edi, %xmm2 +; SKX-NEXT: vpslld $16, %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movl $514, %eax ## imm = 0x202 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <32 x i16> %x, i16 %val, i32 1 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9 @@ -634,10 +698,11 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { ; CHECK-LABEL: insert_v16i16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpbroadcastw (%rsi), %xmm1 +; CHECK-NEXT: vmovd %edi, %xmm2 +; CHECK-NEXT: vpslld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] ; CHECK-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 @@ -648,8 +713,10 @@ define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) { ; CHECK-LABEL: insert_v8i16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $5, %edi, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5],xmm0[6,7] ; CHECK-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <8 x i16> %x, i16 %val, i32 1 @@ -658,14 +725,14 @@ } define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v64i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v64i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastb (%rsi), %ymm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpbroadcastb %xmm2, %ymm2 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; KNL-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; KNL-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <64 x i8> %x, i8 %val, i32 1 %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50 @@ -673,13 +740,26 @@ } define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v32i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v32i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpbroadcastb (%rsi), %xmm1 +; KNL-NEXT: vmovd %edi, %xmm2 +; KNL-NEXT: vpsllw $8, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; KNL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v32i8: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastb (%rsi), %xmm1 +; SKX-NEXT: vmovd %edi, %xmm2 +; SKX-NEXT: vpsllw $8, %xmm2, %xmm2 +; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; SKX-NEXT: movl $131074, %eax ## imm = 0x20002 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <32 x i8> %x, i8 %val, i32 1 %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17 @@ -687,11 +767,24 @@ } define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) { -; CHECK-LABEL: insert_v16i8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i8: +; KNL: ## %bb.0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, (%rsi), %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, %edi, %xmm1, %xmm1 +; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; KNL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i8: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $3, (%rsi), %xmm1, %xmm1 +; SKX-NEXT: vpinsrb $10, %edi, %xmm1, %xmm1 +; SKX-NEXT: movw $1032, %ax ## imm = 0x408 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq %val = load i8, i8* %ptr %r1 = insertelement <16 x i8> %x, i8 %val, i32 3 %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1296,19 +1296,21 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-LABEL: test18: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kmovw %esi, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: movw $-65, %ax -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: kshiftlw $6, %k1, %k1 -; KNL-NEXT: korw %k1, %k0, %k0 -; KNL-NEXT: kshiftlw $9, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k0 -; KNL-NEXT: kshiftlw $7, %k2, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k2 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: movb $-64, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1319,35 +1321,38 @@ ; SKX: ## %bb.0: ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrw $8, %k1, %k2 -; SKX-NEXT: kshiftrw $9, %k1, %k1 -; SKX-NEXT: movb $-65, %al -; SKX-NEXT: kmovd %eax, %k3 -; SKX-NEXT: kandb %k3, %k0, %k0 -; SKX-NEXT: kshiftlb $6, %k1, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k0 -; SKX-NEXT: kshiftlb $7, %k2, %k1 -; SKX-NEXT: 
korb %k1, %k0, %k0 +; SKX-NEXT: kshiftrw $9, %k1, %k2 +; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftlb $1, %k2, %k2 +; SKX-NEXT: kshiftrb $1, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 +; SKX-NEXT: vpmovm2d %k1, %ymm0 +; SKX-NEXT: vpmovm2d %k0, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; SKX-NEXT: vpmovd2m %ymm0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test18: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k0 -; AVX512BW-NEXT: kmovd %esi, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: kmovd %esi, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1357,17 +1362,19 @@ ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovw %edi, %k0 ; AVX512DQ-NEXT: kmovw %esi, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: movb $-65, %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kandb %k3, %k0, %k0 -; AVX512DQ-NEXT: kshiftlb $6, %k1, %k1 -; AVX512DQ-NEXT: korb %k1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 -; AVX512DQ-NEXT: korb %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k1, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k1, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlb $6, %k2, %k2 +; AVX512DQ-NEXT: kshiftlb $1, %k2, %k2 +; AVX512DQ-NEXT: kshiftrb $1, %k2, %k2 +; AVX512DQ-NEXT: korb %k1, %k2, %k1 +; AVX512DQ-NEXT: vpmovm2q %k1, %zmm0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1 +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vpmovq2m %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1378,18 +1385,19 @@ ; X86: ## %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: kshiftrw $8, %k1, %k2 -; X86-NEXT: kshiftrw $9, %k1, %k1 -; X86-NEXT: movb $-65, %al -; X86-NEXT: kmovd %eax, %k3 -; X86-NEXT: kandb %k3, %k0, %k0 -; X86-NEXT: kshiftlb $6, %k1, %k1 -; X86-NEXT: korb %k1, %k0, %k0 -; X86-NEXT: kshiftlb $1, %k0, %k0 -; X86-NEXT: kshiftrb $1, %k0, %k0 -; X86-NEXT: kshiftlb $7, %k2, %k1 -; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: kshiftrw $9, %k1, %k2 +; X86-NEXT: kshiftrw $8, %k1, %k1 +; X86-NEXT: kshiftlb $7, 
%k1, %k1 +; X86-NEXT: kshiftlb $6, %k2, %k2 +; X86-NEXT: kshiftlb $1, %k2, %k2 +; X86-NEXT: kshiftrb $1, %k2, %k2 +; X86-NEXT: korb %k1, %k2, %k1 +; X86-NEXT: vpmovm2d %k1, %ymm0 +; X86-NEXT: vpmovm2d %k0, %ymm1 +; X86-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; X86-NEXT: vpmovd2m %ymm0, %k0 ; X86-NEXT: vpmovm2w %k0, %xmm0 +; X86-NEXT: vzeroupper ; X86-NEXT: retl %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -542,7 +542,7 @@ define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind { ; SSE2-LABEL: _clearupper4xi64b: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [NaN,0.0E+0,NaN,0.0E+0] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0] ; SSE2-NEXT: andps %xmm2, %xmm0 ; SSE2-NEXT: andps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -692,53 +692,83 @@ ; SSE2-LABEL: _clearupper16xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %r10 -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: shrq $56, %r8 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movq %r10, %r9 -; SSE2-NEXT: shrq $48, %r9 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movq %r10, %r11 -; SSE2-NEXT: shrq $32, %r11 -; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $56, %rdx +; SSE2-NEXT: movq %xmm0, %r8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: movq %rdx, %r10 +; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: movl %edx, %ebx +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: shlq $8, %rax +; SSE2-NEXT: orq %rdx, %rax +; SSE2-NEXT: movq %r8, %rdx +; SSE2-NEXT: shrl $16, %ecx ; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: shlq $16, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %r8, %r11 +; SSE2-NEXT: shrl $24, %ebx +; SSE2-NEXT: andl $15, %ebx +; SSE2-NEXT: shlq $24, %rbx +; SSE2-NEXT: orq %rcx, %rbx +; SSE2-NEXT: movq %r8, %rcx +; SSE2-NEXT: shrq $32, %rsi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: shlq $32, %rsi +; SSE2-NEXT: orq %rbx, %rsi +; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: shrq $40, %rdi ; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shrq $32, %rbx -; SSE2-NEXT: andl $15, %ebx -; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %rax ; SSE2-NEXT: shlq $40, %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rdi, %rcx +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: shrq $56, %r9 +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: shrq $48, %r10 +; SSE2-NEXT: andl $15, %r10d +; SSE2-NEXT: shlq $48, %r10 +; SSE2-NEXT: shlq $56, %r9 +; SSE2-NEXT: orq %r10, %r9 +; SSE2-NEXT: movl %r8d, %ebx +; SSE2-NEXT: orq %rdi, %r9 +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: shrl $8, %edi +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: shlq $8, %rdi +; SSE2-NEXT: orq %r8, %rdi +; SSE2-NEXT: shrl 
$16, %ebx +; SSE2-NEXT: andl $15, %ebx +; SSE2-NEXT: shlq $16, %rbx +; SSE2-NEXT: orq %rdi, %rbx +; SSE2-NEXT: shrl $24, %esi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: shlq $24, %rsi +; SSE2-NEXT: orq %rbx, %rsi +; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %rsi, %rax +; SSE2-NEXT: shrq $40, %rcx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: shlq $40, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: shrq $56, %rdx +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: shrq $48, %r11 +; SSE2-NEXT: andl $15, %r11d +; SSE2-NEXT: shlq $48, %r11 ; SSE2-NEXT: shlq $56, %rdx +; SSE2-NEXT: orq %r11, %rdx ; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: shlq $32, %r11 -; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r11, %r10 -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %r10, %rsi -; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %rsi, %r9 -; SSE2-NEXT: shlq $56, %r8 -; SSE2-NEXT: orq %r9, %r8 ; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %r8, %xmm1 +; SSE2-NEXT: movq %r9, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq @@ -746,52 +776,82 @@ ; SSE42-LABEL: _clearupper16xi8b: ; SSE42: # %bb.0: ; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %r10 -; SSE42-NEXT: movq %r10, %r8 -; SSE42-NEXT: shrq $56, %r8 -; SSE42-NEXT: andl $15, %r8d -; SSE42-NEXT: movq %r10, %r9 -; SSE42-NEXT: shrq $48, %r9 -; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi -; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $32, %r11 -; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %xmm0, %rax -; SSE42-NEXT: movq %rax, %rdx -; SSE42-NEXT: shrq $56, %rdx +; SSE42-NEXT: pextrq $1, %xmm0, %r8 +; SSE42-NEXT: movq %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %r9 +; SSE42-NEXT: movq %rdx, %r10 +; SSE42-NEXT: movq %rdx, %rdi +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: movl %edx, %ebx +; SSE42-NEXT: movl %edx, %ecx +; SSE42-NEXT: movl %edx, %eax +; SSE42-NEXT: shrl $8, %eax +; SSE42-NEXT: andl $15, %eax ; SSE42-NEXT: andl $15, %edx -; SSE42-NEXT: movq %rax, %rcx -; SSE42-NEXT: shrq $48, %rcx +; SSE42-NEXT: shlq $8, %rax +; SSE42-NEXT: orq %rdx, %rax +; SSE42-NEXT: movq %r8, %rdx +; SSE42-NEXT: shrl $16, %ecx ; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rax, %rdi +; SSE42-NEXT: shlq $16, %rcx +; SSE42-NEXT: orq %rax, %rcx +; SSE42-NEXT: movq %r8, %r11 +; SSE42-NEXT: shrl $24, %ebx +; SSE42-NEXT: andl $15, %ebx +; SSE42-NEXT: shlq $24, %rbx +; SSE42-NEXT: orq %rcx, %rbx +; SSE42-NEXT: movq %r8, %rcx +; SSE42-NEXT: shrq $32, %rsi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: shlq $32, %rsi +; SSE42-NEXT: orq %rbx, %rsi +; SSE42-NEXT: movq %r8, %rax ; SSE42-NEXT: shrq $40, %rdi ; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %rax, %rbx -; SSE42-NEXT: shrq $32, %rbx -; SSE42-NEXT: andl $15, %ebx -; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %rax ; SSE42-NEXT: shlq $40, %rdi -; SSE42-NEXT: orq %rax, %rdi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rdi, %rcx +; SSE42-NEXT: orq %rsi, %rdi +; SSE42-NEXT: movl %r8d, %esi +; SSE42-NEXT: shrq $56, %r9 +; SSE42-NEXT: andl $15, %r9d +; SSE42-NEXT: shrq $48, %r10 +; SSE42-NEXT: andl $15, %r10d +; SSE42-NEXT: shlq $48, %r10 +; SSE42-NEXT: shlq $56, %r9 +; SSE42-NEXT: orq %r10, %r9 +; SSE42-NEXT: movl %r8d, %ebx +; SSE42-NEXT: orq %rdi, %r9 +; SSE42-NEXT: movl %r8d, %edi +; SSE42-NEXT: shrl $8, %edi +; SSE42-NEXT: andl $15, %edi +; 
SSE42-NEXT: andl $15, %r8d +; SSE42-NEXT: shlq $8, %rdi +; SSE42-NEXT: orq %r8, %rdi +; SSE42-NEXT: shrl $16, %ebx +; SSE42-NEXT: andl $15, %ebx +; SSE42-NEXT: shlq $16, %rbx +; SSE42-NEXT: orq %rdi, %rbx +; SSE42-NEXT: shrl $24, %esi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: shlq $24, %rsi +; SSE42-NEXT: orq %rbx, %rsi +; SSE42-NEXT: shrq $32, %rax +; SSE42-NEXT: andl $15, %eax +; SSE42-NEXT: shlq $32, %rax +; SSE42-NEXT: orq %rsi, %rax +; SSE42-NEXT: shrq $40, %rcx +; SSE42-NEXT: andl $15, %ecx +; SSE42-NEXT: shlq $40, %rcx +; SSE42-NEXT: orq %rax, %rcx +; SSE42-NEXT: shrq $56, %rdx +; SSE42-NEXT: andl $15, %edx +; SSE42-NEXT: shrq $48, %r11 +; SSE42-NEXT: andl $15, %r11d +; SSE42-NEXT: shlq $48, %r11 ; SSE42-NEXT: shlq $56, %rdx +; SSE42-NEXT: orq %r11, %rdx ; SSE42-NEXT: orq %rcx, %rdx -; SSE42-NEXT: shlq $32, %r11 -; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r11, %r10 -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %r10, %rsi -; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %rsi, %r9 -; SSE42-NEXT: shlq $56, %r8 -; SSE42-NEXT: orq %r9, %r8 -; SSE42-NEXT: movq %r8, %xmm1 -; SSE42-NEXT: movq %rdx, %xmm0 +; SSE42-NEXT: movq %rdx, %xmm1 +; SSE42-NEXT: movq %r9, %xmm0 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE42-NEXT: popq %rbx ; SSE42-NEXT: retq @@ -800,52 +860,82 @@ ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movq %r9, %r8 -; AVX-NEXT: shrq $56, %r8 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: movq %r9, %r10 -; AVX-NEXT: shrq $48, %r10 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shldq $24, %r9, %rdx -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: movq %r9, %r11 -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: movq %rcx, %rdi -; AVX-NEXT: shrq $56, %rdi -; AVX-NEXT: andl $15, %edi -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: shrq $48, %rsi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $40, %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: movl %eax, %esi +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: shrl $8, %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: movq %rcx, %rbx -; AVX-NEXT: shrq $32, %rbx -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: shlq $32, %rbx -; AVX-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NEXT: orq %rbx, %rcx -; AVX-NEXT: shlq $40, %rax +; AVX-NEXT: shlq $8, %rax ; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: shlq $48, %rsi -; AVX-NEXT: orq %rax, %rsi -; AVX-NEXT: shlq $56, %rdi +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: shrl $16, %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: shlq $16, %rdx +; AVX-NEXT: orq %rax, %rdx +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrl $24, %esi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: shlq $24, %rsi +; AVX-NEXT: orq %rdx, %rsi +; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: shrq $32, %rdi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: shlq $32, %rdi ; AVX-NEXT: orq %rsi, %rdi -; AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shlq $32, %r11 -; AVX-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F -; AVX-NEXT: orq %r11, %r9 -; AVX-NEXT: shlq $40, %rdx -; AVX-NEXT: orq %r9, %rdx -; AVX-NEXT: shlq $48, %r10 -; AVX-NEXT: orq %rdx, %r10 -; AVX-NEXT: shlq $56, %r8 -; AVX-NEXT: orq %r10, %r8 -; AVX-NEXT: movq %r8, 
-{{[0-9]+}}(%rsp) +; AVX-NEXT: movq %rcx, %rbx +; AVX-NEXT: shrq $40, %r9 +; AVX-NEXT: andl $15, %r9d +; AVX-NEXT: shlq $40, %r9 +; AVX-NEXT: orq %rdi, %r9 +; AVX-NEXT: movq %rcx, %rdi +; AVX-NEXT: shrq $56, %r10 +; AVX-NEXT: andl $15, %r10d +; AVX-NEXT: shrq $48, %r8 +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: shlq $48, %r8 +; AVX-NEXT: shlq $56, %r10 +; AVX-NEXT: orq %r8, %r10 +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: orq %r9, %r10 +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movl %ecx, %r8d +; AVX-NEXT: andl $15, %r8d +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: andl $15, %ecx +; AVX-NEXT: shlq $8, %rcx +; AVX-NEXT: orq %r8, %rcx +; AVX-NEXT: shrl $16, %esi +; AVX-NEXT: andl $15, %esi +; AVX-NEXT: shlq $16, %rsi +; AVX-NEXT: orq %rcx, %rsi +; AVX-NEXT: shrl $24, %edx +; AVX-NEXT: andl $15, %edx +; AVX-NEXT: shlq $24, %rdx +; AVX-NEXT: orq %rsi, %rdx +; AVX-NEXT: shrq $32, %rdi +; AVX-NEXT: andl $15, %edi +; AVX-NEXT: shlq $32, %rdi +; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: shrq $40, %rbx +; AVX-NEXT: andl $15, %ebx +; AVX-NEXT: shlq $40, %rbx +; AVX-NEXT: orq %rdi, %rbx +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: shrq $48, %r11 +; AVX-NEXT: andl $15, %r11d +; AVX-NEXT: shlq $48, %r11 +; AVX-NEXT: shlq $56, %rax +; AVX-NEXT: orq %r11, %rax +; AVX-NEXT: orq %rbx, %rax +; AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq @@ -874,53 +964,83 @@ ; SSE2-LABEL: _clearupper32xi8b: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %r10 -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: shrq $56, %r8 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movq %r10, %r9 -; SSE2-NEXT: shrq $48, %r9 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movq %r10, %r11 -; SSE2-NEXT: shrq $32, %r11 -; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $56, %rdx +; SSE2-NEXT: movq %xmm0, %r8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: movq %rdx, %r10 +; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: movl %edx, %ebx +; SSE2-NEXT: movl %edx, %ecx +; SSE2-NEXT: movl %edx, %eax +; SSE2-NEXT: shrl $8, %eax +; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $48, %rcx +; SSE2-NEXT: shlq $8, %rax +; SSE2-NEXT: orq %rdx, %rax +; SSE2-NEXT: movq %r8, %rdx +; SSE2-NEXT: shrl $16, %ecx ; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: shlq $16, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: movq %r8, %r11 +; SSE2-NEXT: shrl $24, %ebx +; SSE2-NEXT: andl $15, %ebx +; SSE2-NEXT: shlq $24, %rbx +; SSE2-NEXT: orq %rcx, %rbx +; SSE2-NEXT: movq %r8, %rcx +; SSE2-NEXT: shrq $32, %rsi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: shlq $32, %rsi +; SSE2-NEXT: orq %rbx, %rsi +; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: shrq $40, %rdi ; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: shrq $32, %rbx -; SSE2-NEXT: andl $15, %ebx -; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %rax ; SSE2-NEXT: shlq $40, %rdi -; SSE2-NEXT: orq %rax, %rdi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rdi, %rcx +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: movl %r8d, %esi +; SSE2-NEXT: shrq 
$56, %r9 +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: shrq $48, %r10 +; SSE2-NEXT: andl $15, %r10d +; SSE2-NEXT: shlq $48, %r10 +; SSE2-NEXT: shlq $56, %r9 +; SSE2-NEXT: orq %r10, %r9 +; SSE2-NEXT: movl %r8d, %ebx +; SSE2-NEXT: orq %rdi, %r9 +; SSE2-NEXT: movl %r8d, %edi +; SSE2-NEXT: shrl $8, %edi +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: shlq $8, %rdi +; SSE2-NEXT: orq %r8, %rdi +; SSE2-NEXT: shrl $16, %ebx +; SSE2-NEXT: andl $15, %ebx +; SSE2-NEXT: shlq $16, %rbx +; SSE2-NEXT: orq %rdi, %rbx +; SSE2-NEXT: shrl $24, %esi +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: shlq $24, %rsi +; SSE2-NEXT: orq %rbx, %rsi +; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: shlq $32, %rax +; SSE2-NEXT: orq %rsi, %rax +; SSE2-NEXT: shrq $40, %rcx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: shlq $40, %rcx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: shrq $56, %rdx +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: shrq $48, %r11 +; SSE2-NEXT: andl $15, %r11d +; SSE2-NEXT: shlq $48, %r11 ; SSE2-NEXT: shlq $56, %rdx +; SSE2-NEXT: orq %r11, %rdx ; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: shlq $32, %r11 -; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r11, %r10 -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %r10, %rsi -; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %rsi, %r9 -; SSE2-NEXT: shlq $56, %r8 -; SSE2-NEXT: orq %r9, %r8 ; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: movq %r8, %xmm2 +; SSE2-NEXT: movq %r9, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq @@ -928,250 +1048,344 @@ ; SSE42-LABEL: _clearupper32xi8b: ; SSE42: # %bb.0: ; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %r10 -; SSE42-NEXT: movq %r10, %r8 -; SSE42-NEXT: shrq $56, %r8 -; SSE42-NEXT: andl $15, %r8d -; SSE42-NEXT: movq %r10, %r9 -; SSE42-NEXT: shrq $48, %r9 -; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi -; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $32, %r11 -; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %xmm0, %rax -; SSE42-NEXT: movq %rax, %rdx -; SSE42-NEXT: shrq $56, %rdx +; SSE42-NEXT: pextrq $1, %xmm0, %r8 +; SSE42-NEXT: movq %xmm0, %rdx +; SSE42-NEXT: movq %rdx, %r9 +; SSE42-NEXT: movq %rdx, %r10 +; SSE42-NEXT: movq %rdx, %rdi +; SSE42-NEXT: movq %rdx, %rsi +; SSE42-NEXT: movl %edx, %ebx +; SSE42-NEXT: movl %edx, %ecx +; SSE42-NEXT: movl %edx, %eax +; SSE42-NEXT: shrl $8, %eax +; SSE42-NEXT: andl $15, %eax ; SSE42-NEXT: andl $15, %edx -; SSE42-NEXT: movq %rax, %rcx -; SSE42-NEXT: shrq $48, %rcx +; SSE42-NEXT: shlq $8, %rax +; SSE42-NEXT: orq %rdx, %rax +; SSE42-NEXT: movq %r8, %rdx +; SSE42-NEXT: shrl $16, %ecx ; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rax, %rdi +; SSE42-NEXT: shlq $16, %rcx +; SSE42-NEXT: orq %rax, %rcx +; SSE42-NEXT: movq %r8, %r11 +; SSE42-NEXT: shrl $24, %ebx +; SSE42-NEXT: andl $15, %ebx +; SSE42-NEXT: shlq $24, %rbx +; SSE42-NEXT: orq %rcx, %rbx +; SSE42-NEXT: movq %r8, %rcx +; SSE42-NEXT: shrq $32, %rsi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: shlq $32, %rsi +; SSE42-NEXT: orq %rbx, %rsi +; SSE42-NEXT: movq %r8, %rax ; SSE42-NEXT: shrq $40, %rdi ; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %rax, %rbx -; SSE42-NEXT: shrq $32, %rbx -; SSE42-NEXT: andl $15, %ebx -; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %rax ; SSE42-NEXT: shlq $40, %rdi -; SSE42-NEXT: orq %rax, %rdi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rdi, 
%rcx +; SSE42-NEXT: orq %rsi, %rdi +; SSE42-NEXT: movl %r8d, %esi +; SSE42-NEXT: shrq $56, %r9 +; SSE42-NEXT: andl $15, %r9d +; SSE42-NEXT: shrq $48, %r10 +; SSE42-NEXT: andl $15, %r10d +; SSE42-NEXT: shlq $48, %r10 +; SSE42-NEXT: shlq $56, %r9 +; SSE42-NEXT: orq %r10, %r9 +; SSE42-NEXT: movl %r8d, %ebx +; SSE42-NEXT: orq %rdi, %r9 +; SSE42-NEXT: movl %r8d, %edi +; SSE42-NEXT: shrl $8, %edi +; SSE42-NEXT: andl $15, %edi +; SSE42-NEXT: andl $15, %r8d +; SSE42-NEXT: shlq $8, %rdi +; SSE42-NEXT: orq %r8, %rdi +; SSE42-NEXT: shrl $16, %ebx +; SSE42-NEXT: andl $15, %ebx +; SSE42-NEXT: shlq $16, %rbx +; SSE42-NEXT: orq %rdi, %rbx +; SSE42-NEXT: shrl $24, %esi +; SSE42-NEXT: andl $15, %esi +; SSE42-NEXT: shlq $24, %rsi +; SSE42-NEXT: orq %rbx, %rsi +; SSE42-NEXT: shrq $32, %rax +; SSE42-NEXT: andl $15, %eax +; SSE42-NEXT: shlq $32, %rax +; SSE42-NEXT: orq %rsi, %rax +; SSE42-NEXT: shrq $40, %rcx +; SSE42-NEXT: andl $15, %ecx +; SSE42-NEXT: shlq $40, %rcx +; SSE42-NEXT: orq %rax, %rcx +; SSE42-NEXT: shrq $56, %rdx +; SSE42-NEXT: andl $15, %edx +; SSE42-NEXT: shrq $48, %r11 +; SSE42-NEXT: andl $15, %r11d +; SSE42-NEXT: shlq $48, %r11 ; SSE42-NEXT: shlq $56, %rdx +; SSE42-NEXT: orq %r11, %rdx ; SSE42-NEXT: orq %rcx, %rdx -; SSE42-NEXT: shlq $32, %r11 -; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r11, %r10 -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %r10, %rsi -; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %rsi, %r9 -; SSE42-NEXT: shlq $56, %r8 -; SSE42-NEXT: orq %r9, %r8 -; SSE42-NEXT: movq %r8, %xmm2 -; SSE42-NEXT: movq %rdx, %xmm0 +; SSE42-NEXT: movq %rdx, %xmm2 +; SSE42-NEXT: movq %r9, %xmm0 ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE42-NEXT: popq %rbx ; SSE42-NEXT: retq ; ; AVX1-LABEL: _clearupper32xi8b: ; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq %rax, %r10 ; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: movq %rax, %r9 ; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: movl %eax, %esi +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: andl $15, %ecx -; AVX1-NEXT: shlq $32, %rcx -; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; AVX1-NEXT: shrl $8, %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: shlq $8, %rax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: shrq $40, %rdi +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX1-NEXT: shrl $16, %edx +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: shlq $16, %rdx +; AVX1-NEXT: orq %rax, %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrl $24, %esi +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: shlq $24, %rsi +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: movq %rcx, %r11 +; AVX1-NEXT: shrq $32, %rdi ; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: shlq $40, %rdi -; AVX1-NEXT: orq %rax, %rdi -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: shrq $48, %rsi -; AVX1-NEXT: andl $15, %esi -; AVX1-NEXT: shlq $48, %rsi -; AVX1-NEXT: orq %rdi, %rsi -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $56, %rdx -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: shlq $56, %rdx -; AVX1-NEXT: orq %rsi, %rdx -; AVX1-NEXT: movq %rax, %rsi -; AVX1-NEXT: shldq $24, %rax, %r8 -; AVX1-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shrq $32, %rdx -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: shlq $32, %rdx -; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; AVX1-NEXT: orq %rdx, %rax 
+; AVX1-NEXT: shlq $32, %rdi +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: movq %rcx, %rbx +; AVX1-NEXT: shrq $40, %r9 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: shlq $40, %r9 +; AVX1-NEXT: orq %rdi, %r9 +; AVX1-NEXT: movq %rcx, %rdi +; AVX1-NEXT: shrq $56, %r10 +; AVX1-NEXT: andl $15, %r10d +; AVX1-NEXT: shrq $48, %r8 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: shlq $48, %r8 +; AVX1-NEXT: shlq $56, %r10 +; AVX1-NEXT: orq %r8, %r10 +; AVX1-NEXT: movl %ecx, %edx +; AVX1-NEXT: orq %r9, %r10 +; AVX1-NEXT: movl %ecx, %esi +; AVX1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl %ecx, %r8d ; AVX1-NEXT: andl $15, %r8d -; AVX1-NEXT: shlq $40, %r8 -; AVX1-NEXT: orq %rax, %r8 -; AVX1-NEXT: shrq $48, %rsi -; AVX1-NEXT: andl $15, %esi -; AVX1-NEXT: shlq $48, %rsi -; AVX1-NEXT: orq %r8, %rsi -; AVX1-NEXT: shrq $56, %rcx -; AVX1-NEXT: andl $15, %ecx -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: shlq $56, %rcx -; AVX1-NEXT: orq %rsi, %rcx -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %ecx -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $40, %rcx -; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: shlq $8, %rcx +; AVX1-NEXT: orq %r8, %rcx +; AVX1-NEXT: shrl $16, %esi +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: shlq $16, %rsi +; AVX1-NEXT: orq %rcx, %rsi +; AVX1-NEXT: shrl $24, %edx +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: shlq $24, %rdx +; AVX1-NEXT: orq %rsi, %rdx +; AVX1-NEXT: shrq $32, %rdi +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: shlq $32, %rdi +; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: shrq $40, %rbx +; AVX1-NEXT: andl $15, %ebx +; AVX1-NEXT: shlq $40, %rbx +; AVX1-NEXT: orq %rdi, %rbx ; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: shrq $48, %r11 +; AVX1-NEXT: andl $15, %r11d +; AVX1-NEXT: shlq $48, %r11 +; AVX1-NEXT: shlq $56, %rax +; AVX1-NEXT: orq %r11, %rax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: orq %rbx, %rax +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movl %ecx, %eax ; AVX1-NEXT: shrl $8, %eax -; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; AVX1-NEXT: movl %ecx, %eax ; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX1-NEXT: movl %ecx, %eax ; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: shrq $40, %rax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: shrq $48, %rax -; AVX1-NEXT: 
vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: shrq $56, %rcx -; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $8, %ecx +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $24, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $40, %rcx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrq $56, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper32xi8b: ; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %rax, %r10 ; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movq %rax, %r9 ; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: movl %eax, %esi +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: shlq $32, %rcx -; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: shlq $8, %rax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shrq $40, %rdi +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX2-NEXT: shrl $16, %edx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: shlq $16, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrl $24, %esi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: shlq $24, %rsi +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: shrq $32, %rdi ; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: shlq $40, %rdi -; AVX2-NEXT: orq %rax, %rdi -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: shrq $48, %rsi -; AVX2-NEXT: andl $15, %esi -; AVX2-NEXT: shlq $48, %rsi -; AVX2-NEXT: orq %rdi, %rsi -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $56, %rdx -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: shlq $56, %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq $24, %rax, %r8 -; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shrq $32, %rdx -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; AVX2-NEXT: orq %rdx, %rax +; AVX2-NEXT: shlq $32, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: movq %rcx, %rbx +; AVX2-NEXT: shrq $40, %r9 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: shlq $40, %r9 +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: movq %rcx, %rdi +; AVX2-NEXT: shrq $56, %r10 +; AVX2-NEXT: andl $15, %r10d +; AVX2-NEXT: shrq $48, %r8 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: shlq $48, %r8 +; AVX2-NEXT: shlq $56, %r10 +; AVX2-NEXT: orq %r8, %r10 +; AVX2-NEXT: movl %ecx, %edx +; AVX2-NEXT: orq %r9, %r10 +; AVX2-NEXT: movl %ecx, %esi +; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %ecx, %r8d ; AVX2-NEXT: andl $15, %r8d -; 
AVX2-NEXT: shlq $40, %r8 -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: shrq $48, %rsi -; AVX2-NEXT: andl $15, %esi -; AVX2-NEXT: shlq $48, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: shlq $56, %rcx -; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: shlq $8, %rcx +; AVX2-NEXT: orq %r8, %rcx +; AVX2-NEXT: shrl $16, %esi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: shlq $16, %rsi +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: shrl $24, %edx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: shlq $24, %rdx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: shrq $32, %rdi +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: shlq $32, %rdi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: shrq $40, %rbx +; AVX2-NEXT: andl $15, %ebx +; AVX2-NEXT: shlq $40, %rbx +; AVX2-NEXT: orq %rdi, %rbx ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0 +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: shrq $48, %r11 +; AVX2-NEXT: andl $15, %r11d +; AVX2-NEXT: shlq $48, %r11 +; AVX2-NEXT: shlq $56, %rax +; AVX2-NEXT: orq %r11, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: orq %rbx, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $24, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $40, %rax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $8, %ecx +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $24, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; 
AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $40, %rcx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx ; AVX2-NEXT: retq %x4 = bitcast <32 x i8> %0 to <64 x i4> %r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -152,9 +152,8 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: movl $-1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSE2-NEXT: retq ; @@ -162,9 +161,8 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE3-NEXT: movl $-1, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSE3-NEXT: retq ; @@ -172,9 +170,8 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSSE3-NEXT: movl $-1, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSSE3-NEXT: retq ; @@ -211,23 +208,20 @@ define <8 x i16> @insert_v8i16_x12345x7(<8 x i16> %a) { ; SSE2-LABEL: insert_v8i16_x12345x7: ; SSE2: # %bb.0: -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i16_x12345x7: ; SSE3: # %bb.0: -; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i16_x12345x7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i16_x12345x7: @@ -249,26 +243,26 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { ; SSE2-LABEL: insert_v16i16_x12345x789ABCDEx: ; SSE2: # %bb.0: -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %eax, %xmm1 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_x12345x789ABCDEx: ; SSE3: # %bb.0: -; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_x12345x789ABCDEx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_x12345x789ABCDEx: @@ -282,35 +276,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32,1,2,3,4,5,38,7,8,9,10,11,12,13,14,47] -; AVX512F-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,1,2,3,4,5,22,7,8,9,10,11,12,13,14,31] -; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: movw $-32703, %ax # imm = 0x8041 +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 @@ -321,55 +308,59 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { ; SSE2-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE2: # %bb.0: -; 
SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: movl $255, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: pandn %xmm2, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] -; SSE3-NEXT: por %xmm2, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $255, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero -; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero +; SSSE3-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i8_x123456789ABCDEx: ; SSE41: # %bb.0: -; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i8_x123456789ABCDEx: -; AVX: # %bb.0: -; AVX-NEXT: movl $255, %eax -; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: insert_v16i8_x123456789ABCDEx: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, 
%xmm1, %xmm1 +; AVX512VL-NEXT: movw $-32767, %ax # imm = 0x8001 +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %1 = insertelement <16 x i8> %a, i8 -1, i32 0 %2 = insertelement <16 x i8> %1, i8 -1, i32 15 ret <16 x i8> %2 @@ -378,103 +369,65 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movl $255, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE3-NEXT: pand %xmm2, %xmm0 -; SSE3-NEXT: movl $255, %eax -; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: pandn %xmm3, %xmm2 -; SSE3-NEXT: por %xmm2, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; SSE3-NEXT: pand %xmm2, %xmm0 -; SSE3-NEXT: movdqa %xmm3, %xmm4 -; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0] -; SSE3-NEXT: por %xmm4, %xmm0 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSE3-NEXT: por %xmm3, %xmm1 -; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm4, %xmm1 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $255, %eax -; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[u] -; SSSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero +; SSSE3-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; SSE41: # %bb.0: -; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: pinsrb $14, %eax, %xmm1 -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; AVX1: # %bb.0: -; AVX1-NEXT: movl $255, %eax -; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; AVX2: # %bb.0: -; AVX2-NEXT: movl $255, %eax -; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; AVX512: # %bb.0: -; AVX512-NEXT: movl $255, %eax -; AVX512-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: movl $-1073709055, %eax # imm = 0xC0008001 +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: retq %1 = insertelement <32 x i8> %a, i8 -1, i32 0 %2 = insertelement <32 x i8> %1, i8 -1, i32 15 %3 = insertelement <32 x i8> %2, i8 -1, i32 30 diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- 
a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -45,30 +45,30 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) { ; SSE2-LABEL: insert_v4f64_0zz3: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v4f64_0zz3: ; SSE3: # %bb.0: -; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE3-NEXT: xorpd %xmm2, %xmm2 ; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v4f64_0zz3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSSE3-NEXT: xorpd %xmm2, %xmm2 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v4f64_0zz3: ; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE41-NEXT: retq ; ; AVX-LABEL: insert_v4f64_0zz3: @@ -191,7 +191,8 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSE2-NEXT: retq ; @@ -199,7 +200,8 @@ ; SSE3: # %bb.0: ; SSE3-NEXT: xorps %xmm2, %xmm2 ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSE3-NEXT: xorps %xmm2, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSE3-NEXT: retq ; @@ -207,7 +209,8 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm2, %xmm2 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSSE3-NEXT: retq ; @@ -268,29 +271,20 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) { ; SSE2-LABEL: insert_v8i32_z12345z7: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i32_z12345z7: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i32_z12345z7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i32_z12345z7: @@ -313,23 +307,17 @@ define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) { ; SSE2-LABEL: insert_v8i16_z12345z7: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i16_z12345z7: ; SSE3: # %bb.0: -; SSE3-NEXT: xorl %eax, %eax -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i16_z12345z7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i16_z12345z7: @@ -351,26 +339,20 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) { ; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %eax, %xmm1 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE3: # %bb.0: -; SSE3-NEXT: xorl %eax, %eax -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz: @@ -391,46 +373,15 @@ } define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) { -; SSE2-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE2: # %bb.0: -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: retq +; SSE-LABEL: insert_v16i8_z123456789ABCDEz: +; SSE: # %bb.0: +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; -; SSE3-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE3: # %bb.0: -; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz: -; SSSE3: # %bb.0: -; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX1: # %bb.0: -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: xorl %eax, %eax -; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0, %xmm0 -; AVX2-FAST-NEXT: retq +; AVX-LABEL: insert_v16i8_z123456789ABCDEz: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = insertelement <16 x i8> %a, i8 0, i32 0 %2 = insertelement <16 x i8> %1, i8 0, i32 15 ret <16 x i8> %2 @@ -457,11 +408,9 @@ ; ; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: ; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -6318,20 +6318,15 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) { ; SSE2-LABEL: mload_constmask_v4f32: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE2-NEXT: movups (%rdi), %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE42-LABEL: mload_constmask_v4f32: ; SSE42: ## %bb.0: -; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: movups (%rdi), %xmm1 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v4f32: @@ -6419,20 +6414,20 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) { ; SSE2-LABEL: mload_constmask_v4i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: mload_constmask_v4i32: ; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0 -; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0 -; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm0 +; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm1 +; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm1 +; SSE42-NEXT: pinsrd $3, 12(%rdi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: retq ; ; AVX1-LABEL: mload_constmask_v4i32: @@ -6515,21 +6510,18 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) { ; SSE2-LABEL: mload_constmask_v8f32: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0] -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3] +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: mload_constmask_v8f32: ; SSE42: ## %bb.0: -; SSE42-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f32: @@ -6576,9 +6568,11 @@ define <8 x float> @mload_constmask_v8f32_zero(<8 x float>* %addr, <8 x float> %dst) { ; SSE2-LABEL: mload_constmask_v8f32_zero: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -6631,8 +6625,8 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) { ; SSE-LABEL: mload_constmask_v4f64: ; SSE: ## %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v4f64: @@ -6681,12 +6675,10 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) { ; SSE2-LABEL: mload_constmask_v8i32: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3] +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] @@ -6695,10 +6687,13 @@ ; ; SSE42-LABEL: mload_constmask_v8i32: ; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrd $0, (%rdi), %xmm0 -; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm0 -; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm0 -; SSE42-NEXT: pinsrd $3, 28(%rdi), %xmm1 +; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; SSE42-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: pinsrd $1, 4(%rdi), %xmm2 +; SSE42-NEXT: pinsrd $2, 8(%rdi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8i32: @@ -6750,8 +6745,10 @@ ; ; SSE42-LABEL: mload_constmask_v4i64: ; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrq $0, (%rdi), %xmm0 -; SSE42-NEXT: pinsrq $1, 24(%rdi), %xmm1 +; 
SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE42-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v4i64: @@ -6798,9 +6795,9 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) { ; SSE-LABEL: mload_constmask_v8f64: ; SSE: ## %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1] +; SSE-NEXT: movups (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f64: @@ -6847,14 +6844,14 @@ ; SSE-LABEL: mload_constmask_v16f64_allones_split: ; SSE: ## %bb.0: ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: movups (%rsi), %xmm0 -; SSE-NEXT: movups 16(%rsi), %xmm1 -; SSE-NEXT: movups 32(%rsi), %xmm2 -; SSE-NEXT: movups 48(%rsi), %xmm3 ; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; SSE-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] ; SSE-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; SSE-NEXT: movups (%rsi), %xmm0 +; SSE-NEXT: movups 16(%rsi), %xmm1 +; SSE-NEXT: movups 32(%rsi), %xmm2 +; SSE-NEXT: movups 48(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm7, 112(%rdi) ; SSE-NEXT: movaps %xmm6, 96(%rdi) ; SSE-NEXT: movaps %xmm5, 80(%rdi) @@ -7179,10 +7176,10 @@ ; SSE2-LABEL: load_one_mask_bit_set6: ; SSE2: ## %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE2-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; SSE2-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] ; SSE2-NEXT: movaps %xmm7, 112(%rdi) ; SSE2-NEXT: movaps %xmm4, 64(%rdi) ; SSE2-NEXT: movaps %xmm3, 48(%rdi) @@ -7196,17 +7193,20 @@ ; SSE42-LABEL: load_one_mask_bit_set6: ; SSE42: ## %bb.0: ; SSE42-NEXT: movq %rdi, %rax -; SSE42-NEXT: pinsrq $0, 16(%rsi), %xmm1 -; SSE42-NEXT: pinsrq $0, 80(%rsi), %xmm5 -; SSE42-NEXT: pinsrq $1, 104(%rsi), %xmm6 +; SSE42-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero +; SSE42-NEXT: blendps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE42-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; SSE42-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero +; SSE42-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE42-NEXT: movaps %xmm7, 112(%rdi) ; SSE42-NEXT: movaps %xmm4, 64(%rdi) ; SSE42-NEXT: movaps %xmm3, 48(%rdi) ; SSE42-NEXT: movaps %xmm2, 32(%rdi) ; SSE42-NEXT: movaps %xmm0, (%rdi) -; SSE42-NEXT: movdqa %xmm6, 96(%rdi) -; SSE42-NEXT: movdqa %xmm5, 80(%rdi) -; SSE42-NEXT: movdqa %xmm1, 16(%rdi) +; SSE42-NEXT: movaps %xmm6, 96(%rdi) +; SSE42-NEXT: movaps %xmm1, 80(%rdi) +; SSE42-NEXT: movaps %xmm8, 16(%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: load_one_mask_bit_set6: diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -134,31 +134,31 @@ define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,3,3] -; SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] -; SSE42-NEXT: pmuludq %xmm3, %xmm1 -; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; ; AVX1-LABEL: _mul4xi32toi64a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -4,28 +4,13 @@ define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl 6(%rdi), %r8d -; CHECK-NEXT: movzwl 4(%rdi), %r11d -; CHECK-NEXT: movq (%rsi), %rsi -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: pextrw $1, %xmm0, %r9d -; CHECK-NEXT: movd %xmm0, %r10d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm0, %edi -; CHECK-NEXT: movw %r11w, 8(%rdx) -; CHECK-NEXT: movw %cx, 4(%rdx) -; CHECK-NEXT: movw %r8w, 12(%rdx) -; CHECK-NEXT: movw %si, (%rdx) -; CHECK-NEXT: movw %di, 10(%rdx) -; CHECK-NEXT: movw %ax, 14(%rdx) -; CHECK-NEXT: movw %r10w, 2(%rdx) -; CHECK-NEXT: movw %r9w, 6(%rdx) +; CHECK-NEXT: movq (%rsi), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: movdqa %xmm0, (%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll --- a/llvm/test/CodeGen/X86/vec_insert-7.ll +++ b/llvm/test/CodeGen/X86/vec_insert-7.ll @@ -8,7 +8,8 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X32-LABEL: mmx_movzl: ; X32: ## %bb.0: -; X32-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 +; X32-NEXT: movl $32, %eax +; X32-NEXT: movd %eax, %mm0 ; X32-NEXT: retl ; ; X64-LABEL: mmx_movzl: diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -2422,12 +2422,15 @@ ; SSE2-NEXT: movzbl (%rsi), %ecx ; SSE2-NEXT: shll $8, %ecx ; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4] ; SSE2-NEXT: packuswb %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -534,8 +534,8 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %dst) { ; SKX64-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; SKX64: # %bb.0: -; SKX64-NEXT: vmovdqa 32(%rdi), %xmm0 -; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SKX64-NEXT: vpbroadcastd 44(%rdi), %xmm0 +; SKX64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; SKX64-NEXT: vmovdqa %ymm0, 672(%rsi) ; SKX64-NEXT: vmovdqa 208(%rdi), %xmm0 ; SKX64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero @@ -545,11 +545,11 @@ ; ; KNL64-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; KNL64: # %bb.0: -; KNL64-NEXT: vmovdqa 32(%rdi), %xmm0 -; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; KNL64-NEXT: vpbroadcastd 44(%rdi), %xmm0 +; KNL64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; KNL64-NEXT: vmovdqa %ymm0, 672(%rsi) -; KNL64-NEXT: vmovdqa 208(%rdi), %xmm0 -; KNL64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; KNL64-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] +; KNL64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; KNL64-NEXT: vmovdqa %ymm0, 832(%rsi) ; KNL64-NEXT: retq ; @@ -557,8 +557,8 @@ ; SKX32: # %bb.0: ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SKX32-NEXT: vmovdqa 32(%ecx), %xmm0 -; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SKX32-NEXT: vpbroadcastd 44(%ecx), %xmm0 +; SKX32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; SKX32-NEXT: vmovdqa %ymm0, 672(%eax) ; SKX32-NEXT: vmovdqa 208(%ecx), %xmm0 ; SKX32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero @@ -569,13 +569,13 @@ ; KNL32-LABEL: test_demandedelts_pshufb_v32i8_v16i8: ; KNL32: # %bb.0: ; KNL32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovdqa 32(%eax), %xmm0 -; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; KNL32-NEXT: vmovdqa %ymm0, 672(%ecx) -; KNL32-NEXT: vmovdqa 208(%eax), %xmm0 -; KNL32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; KNL32-NEXT: vmovdqa %ymm0, 832(%ecx) +; KNL32-NEXT: vpbroadcastd 44(%ecx), %xmm0 +; KNL32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; KNL32-NEXT: vmovdqa %ymm0, 672(%eax) +; KNL32-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] +; KNL32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; KNL32-NEXT: vmovdqa %ymm0, 832(%eax) ; KNL32-NEXT: retl %t64 = bitcast <2 x i32>* %src to <16 x i32>* %t87 = load <16 x i32>, <16 x i32>* %t64, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2835,28 +2835,32 @@ } define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { -; SSE-LABEL: shuffle_extract_insert: -; SSE: # %bb.0: -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_extract_insert: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX1-LABEL: shuffle_extract_insert: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX1-NEXT: retq +; SSSE3-LABEL: shuffle_extract_insert: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] +; SSSE3-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_extract_insert: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-SLOW-NEXT: retq +; SSE41-LABEL: shuffle_extract_insert: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] +; SSE41-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_extract_insert: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] -; AVX2-FAST-NEXT: retq +; AVX-LABEL: shuffle_extract_insert: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] +; AVX-NEXT: retq %a0 = extractelement <8 x i16> %a, i32 0 %a1 = extractelement <8 x i16> %a, i32 1 %a3 = extractelement <8 x i16> %a, i32 3 @@ -2928,41 +2932,66 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} 
xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,0,65535] +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_concat_insert: ; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u],zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[6,7,u,u,10,11,u,u,14,15] +; SSSE3-NEXT: por %xmm2, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,4,5,6,7],zero,zero,xmm0[10,11],zero,zero,xmm0[14,15] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,zero,xmm1[4,5],zero,zero,xmm1[0,1],zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_concat_insert: ; SSE41: # %bb.0: -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,0] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_extract_concat_insert: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_extract_concat_insert: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_extract_concat_insert: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] +; AVX2-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 %a4 = extractelement <8 x i16> %a, i32 4 @@ -2984,37 +3013,39 @@ ; SSE2-LABEL: shuffle_scalar_to_vector_extract: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $7, %xmm0, %ecx +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pinsrw $1, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 ; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: pinsrw $5, %eax, %xmm1 ; SSE2-NEXT: movsbl (%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pinsrw $6, %eax, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: pextrw $4, %xmm0, %eax +; SSSE3-NEXT: pextrw $7, %xmm0, %ecx +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pinsrw $1, %eax, %xmm1 +; SSSE3-NEXT: pinsrw $4, %ecx, %xmm1 ; SSSE3-NEXT: movsbl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSSE3-NEXT: pinsrw $5, %eax, %xmm1 ; SSSE3-NEXT: movsbl (%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pinsrw $6, %eax, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_scalar_to_vector_extract: @@ -3024,30 +3055,43 @@ ; SSE41-NEXT: pextrw $7, %xmm0, %ecx ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrw $1, %eax, %xmm0 -; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 ; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 ; SSE41-NEXT: movsbl (%rsi), %eax ; SSE41-NEXT: pinsrw $5, %eax, %xmm0 ; SSE41-NEXT: movsbl (%rdx), %eax ; SSE41-NEXT: pinsrw $6, %eax, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_scalar_to_vector_extract: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX-NEXT: vpextrw $4, %xmm0, %eax -; AVX-NEXT: vpextrw $7, %xmm0, %ecx -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB -; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movsbl (%rsi), %eax -; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; AVX-NEXT: movsbl (%rdx), %eax -; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_scalar_to_vector_extract: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movsbl (%rsi), %eax +; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movsbl (%rdx), %eax +; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_scalar_to_vector_extract: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX2-NEXT: vpextrw $4, %xmm0, %eax +; AVX2-NEXT: vpextrw $7, %xmm0, %ecx +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movsbl (%rsi), %eax +; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movsbl (%rdx), %eax +; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-NEXT: retq %tmp = load <8 x i8>, <8 x i8>* %p0, align 1 %tmp1 = sext <8 x i8> %tmp to <8 x i16> %tmp2 = load i8, i8* %p1, align 1