Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1426,12 +1426,36 @@ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, KnownSrcBits, TLO, Depth + 1)) return true; + } else if ((NumSrcEltBits % BitWidth) == 0 && + TLO.DAG.getDataLayout().isLittleEndian()) { + unsigned Scale = NumSrcEltBits / BitWidth; + unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; + APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits); + APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts); + for (unsigned i = 0; i != NumElts; ++i) + if (DemandedElts[i]) { + unsigned Offset = (i % Scale) * BitWidth; + DemandedSrcBits.insertBits(DemandedBits, Offset); + DemandedSrcElts.setBit(i / Scale); + } + + if (SrcVT.isVector()) { + APInt KnownSrcUndef, KnownSrcZero; + if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef, + KnownSrcZero, TLO, Depth + 1)) + return true; + } + + KnownBits KnownSrcBits; + if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, + KnownSrcBits, TLO, Depth + 1)) + return true; } // If this is a bitcast, let computeKnownBits handle it. Only do this on a // recursive call where Known may be useful to the caller. if (Depth > 0) { - Known = TLO.DAG.computeKnownBits(Op, Depth); + Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); return false; } break; Index: test/CodeGen/AMDGPU/store-weird-sizes.ll =================================================================== --- test/CodeGen/AMDGPU/store-weird-sizes.ll +++ test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -45,7 +45,8 @@ ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 ; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 ; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; HAWAII-NEXT: v_bfe_u32 v0, v0, 16, 7 ; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 ; HAWAII-NEXT: ds_write_b32 v1, v3 ; HAWAII-NEXT: s_endpgm @@ -67,7 +68,8 @@ ; FIJI-NEXT: v_mov_b32_e32 v2, s2 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 ; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 ; FIJI-NEXT: ds_write_b32 v1, v3 ; FIJI-NEXT: s_endpgm Index: test/CodeGen/X86/bitcast-setcc-256.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-256.ll +++ test/CodeGen/X86/bitcast-setcc-256.ll @@ -448,22 +448,6 @@ define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) { ; SSE2-SSSE3-LABEL: bitcast_4i64_store: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax ; SSE2-SSSE3-NEXT: movb %al, (%rdi) Index: test/CodeGen/X86/dagcombine-cse.ll =================================================================== --- test/CodeGen/X86/dagcombine-cse.ll +++ test/CodeGen/X86/dagcombine-cse.ll @@ -14,18 +14,11 @@ ; ; X64-LABEL: t: ; X64: ## %bb.0: ## %entry -; X64-NEXT: ## kill: def $edx killed $edx def $rdx -; X64-NEXT: ## kill: def $esi killed $esi def $rsi ; X64-NEXT: imull %ecx, %esi -; X64-NEXT: leal (%rsi,%rdx), %eax -; X64-NEXT: cltq +; X64-NEXT: addl %edx, %esi +; X64-NEXT: movslq %esi, %rax ; X64-NEXT: movl (%rdi,%rax), %eax -; X64-NEXT: leal 4(%rsi,%rdx), %ecx -; X64-NEXT: movslq %ecx, %rcx -; X64-NEXT: movzwl (%rdi,%rcx), %ecx -; X64-NEXT: shlq $32, %rcx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq entry: Index: test/CodeGen/X86/masked_store.ll =================================================================== --- test/CodeGen/X86/masked_store.ll +++ test/CodeGen/X86/masked_store.ll @@ -36,25 +36,21 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) { ; SSE2-LABEL: store_v2f64_v2i64: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movlpd %xmm1, (%rdi) ; SSE2-NEXT: LBB1_2: ## %else -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB1_4 @@ -117,20 +113,16 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: movd %xmm7, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movlpd %xmm2, (%rdi) ; SSE2-NEXT: LBB2_2: ## %else -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB2_4 @@ -140,10 +132,9 @@ ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pextrw $0, %xmm0, %eax @@ -863,25 +854,21 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) { ; SSE2-LABEL: store_v2i64_v2i64: ; SSE2: ## %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB7_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movq %xmm1, (%rdi) ; SSE2-NEXT: LBB7_2: ## %else -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB7_4 @@ -950,20 +937,16 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: movd %xmm7, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB8_2 ; SSE2-NEXT: ## %bb.1: ## %cond.store ; SSE2-NEXT: movq %xmm2, (%rdi) ; SSE2-NEXT: LBB8_2: ## %else -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je LBB8_4 @@ -974,10 +957,9 @@ ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pextrw $0, %xmm0, %eax Index: test/CodeGen/X86/movmsk-cmp.ll =================================================================== --- test/CodeGen/X86/movmsk-cmp.ll +++ test/CodeGen/X86/movmsk-cmp.ll @@ -929,22 +929,6 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) { ; SSE2-LABEL: allones_v4i64_sign: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: cmpb $15, %al @@ -989,22 +973,6 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) { ; SSE2-LABEL: allzeros_v4i64_sign: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: testb %al, %al @@ -4522,22 +4490,6 @@ define i32 @movmskpd256(<4 x double> %x) { ; SSE2-LABEL: movmskpd256: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: retq