Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15627,7 +15627,7 @@
       if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
         break;

-      if (TLI.isTypeLegal(Ty) &&
+      if (isTypeLegal(Ty) &&
           TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
           TLI.allowsMemoryAccess(Context, DL, Ty,
                                  *FirstInChain->getMemOperand(), &IsFast) &&
Index: llvm/test/CodeGen/SystemZ/vec-combine-02.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-combine-02.ll
+++ llvm/test/CodeGen/SystemZ/vec-combine-02.ll
@@ -142,11 +142,13 @@
 define void @f5(<4 x i32> %val0, <4 x i32> %val1,
                 <4 x i32> %val2, <4 x i32> %val3, i8 *%base) {
 ; CHECK-LABEL: f5:
-; CHECK-DAG: vsteb %v24, 0(%r2), 11
-; CHECK-DAG: vsteb %v26, 1(%r2), 15
-; CHECK-DAG: vsteb %v28, 2(%r2), 3
-; CHECK-DAG: vsteb %v30, 3(%r2), 7
-; CHECK: br %r14
+; CHECK-DAG: larl %r1, .LCPI4_0
+; CHECK-DAG: vl %v2, 0(%r1)
+; CHECK-DAG: vmrhg %v0, %v28, %v30
+; CHECK-DAG: vmrlg %v1, %v24, %v26
+; CHECK-DAG: vperm %v0, %v1, %v0, %v2
+; CHECK-DAG: vstef %v0, 0(%r2), 0
+; CHECK-DAG: br %r14
   %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
   %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
   %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16>
Index: llvm/test/CodeGen/X86/masked_compressstore.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_compressstore.ll
+++ llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -1124,37 +1124,25 @@
 define void @compressstore_v16f32_const(float* %base, <16 x float> %V) {
 ; SSE2-LABEL: compressstore_v16f32_const:
 ; SSE2:       ## %bb.0: ## %cond.store
-; SSE2-NEXT:    movss %xmm0, (%rdi)
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
-; SSE2-NEXT:    movss %xmm4, 4(%rdi)
-; SSE2-NEXT:    movaps %xmm0, %xmm4
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE2-NEXT:    movss %xmm4, 8(%rdi)
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movss %xmm0, 12(%rdi)
-; SSE2-NEXT:    movss %xmm1, 16(%rdi)
+; SSE2-NEXT:    movlps %xmm0, (%rdi)
+; SSE2-NEXT:    movaps %xmm1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movss %xmm0, 8(%rdi)
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 ; SSE2-NEXT:    movss %xmm0, 20(%rdi)
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    movss %xmm0, 24(%rdi)
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE2-NEXT:    movss %xmm1, 28(%rdi)
+; SSE2-NEXT:    movlps %xmm4, 12(%rdi)
 ; SSE2-NEXT:    movss %xmm2, 32(%rdi)
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
-; SSE2-NEXT:    movss %xmm0, 36(%rdi)
-; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT:    movss %xmm2, 40(%rdi)
+; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE2-NEXT:    movlps %xmm1, 24(%rdi)
 ; SSE2-NEXT:    movss %xmm3, 44(%rdi)
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,2,2,3]
+; SSE2-NEXT:    movlps %xmm2, 36(%rdi)
 ; SSE2-NEXT:    movaps %xmm3, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[2,3]
-; SSE2-NEXT:    movss %xmm0, 48(%rdi)
-; SSE2-NEXT:    movaps %xmm3, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE2-NEXT:    movss %xmm0, 52(%rdi)
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3]
+; SSE2-NEXT:    movlps %xmm0, 48(%rdi)
 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE2-NEXT:    movss %xmm3, 56(%rdi)
 ; SSE2-NEXT:    retq
@@ -1165,33 +1153,35 @@
 ; SSE42-NEXT:    movups %xmm1, 16(%rdi)
 ; SSE42-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; SSE42-NEXT:    movups %xmm2, 32(%rdi)
-; SSE42-NEXT:    extractps $1, %xmm3, 48(%rdi)
-; SSE42-NEXT:    extractps $2, %xmm3, 52(%rdi)
 ; SSE42-NEXT:    extractps $3, %xmm3, 56(%rdi)
+; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; SSE42-NEXT:    movlps %xmm3, 48(%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: compressstore_v16f32_const:
 ; AVX1:       ## %bb.0: ## %cond.store
 ; AVX1-NEXT:    vmovups %ymm0, (%rdi)
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vmovups %xmm1, 32(%rdi)
-; AVX1-NEXT:    vextractps $1, %xmm0, 48(%rdi)
-; AVX1-NEXT:    vextractps $2, %xmm0, 52(%rdi)
 ; AVX1-NEXT:    vextractps $3, %xmm0, 56(%rdi)
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,2,3,3]
+; AVX1-NEXT:    vmovlps %xmm2, 48(%rdi)
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vmovups %xmm0, 32(%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: compressstore_v16f32_const:
 ; AVX2:       ## %bb.0: ## %cond.store
 ; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vextractps $3, %xmm0, 56(%rdi)
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = <0,1,2,4,5,6,7,u>
+; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovlps %xmm0, 48(%rdi)
 ; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,2,4]
 ; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovups %xmm0, 32(%rdi)
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vextractps $1, %xmm0, 48(%rdi)
-; AVX2-NEXT:    vextractps $2, %xmm0, 52(%rdi)
-; AVX2-NEXT:    vextractps $3, %xmm0, 56(%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
Index: llvm/test/CodeGen/X86/pr40994.ll
===================================================================
--- llvm/test/CodeGen/X86/pr40994.ll
+++ llvm/test/CodeGen/X86/pr40994.ll
@@ -4,15 +4,9 @@
 define <8 x i8> @foo(<16 x i8> %a) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %cond.store
-; CHECK-NEXT:    pextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $2, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $4, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $6, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $8, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %v = alloca i8, i32 8, align 16
   call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
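
Note on the functional change: the DAGCombiner hunk replaces the
TLI.isTypeLegal(Ty) query in the store-merging legality check with the
DAGCombiner's own isTypeLegal() wrapper. As a reference sketch (paraphrased
from the member helper in DAGCombiner.cpp; the exact wording may differ in
the revision being patched):

    /// DAGCombiner helper: returns true if we are running before type
    /// legalization, or if the specified VT is legal for the target.
    bool isTypeLegal(const EVT &VT) {
      if (!LegalTypes) return true;  // pre-type-legalization: accept any type
      return TLI.isTypeLegal(VT);    // afterwards, defer to the target (TLI)
    }

When the combine runs before type legalization, consecutive narrow stores can
now be merged into a wide vector store whose type the type legalizer will
handle later. The regenerated CHECK lines show the effect: per-element stores
(vsteb, movss, extractps, pextrb) collapse into wider merged stores (vstef,
movlps, movups, movq).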