Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16646,18 +16646,23 @@
   // If extraction is cheap, we don't need to look at the binop operands
   // for concat ops. The narrow binop alone makes this transform profitable.
-  // TODO: We're not dealing with the bitcasted pattern here. That limitation
-  // should be lifted.
-  if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
-      TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
+  // We can't just reuse the original extract index operand because we may have
+  // bitcasted.
+  unsigned ConcatOpNum = ExtractIndex / NumElems;
+  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
+      BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
     // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
     SDLoc DL(Extract);
+    SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
     SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(0), Extract->getOperand(1));
+                            BinOp.getOperand(0), NewExtIndex);
     SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(1), Extract->getOperand(1));
-    return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
-                       BinOp.getNode()->getFlags());
+                            BinOp.getOperand(1), NewExtIndex);
+    SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
+                                      BinOp.getNode()->getFlags());
+    return DAG.getBitcast(VT, NarrowBinOp);
   }
 
   // Only handle the case where we are doubling and then halving. A larger ratio
@@ -16686,11 +16691,7 @@
     return SDValue();
 
   // If one of the binop operands was not the result of a concat, we must
-  // extract a half-sized operand for our new narrow binop. We can't just reuse
-  // the original extract index operand because we may have bitcasted.
-  unsigned ConcatOpNum = ExtractIndex / NumElems;
-  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
-  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  // extract a half-sized operand for our new narrow binop.
   SDLoc DL(Extract);
   // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
Index: test/CodeGen/AArch64/arm64-ld1.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ld1.ll
+++ test/CodeGen/AArch64/arm64-ld1.ll
@@ -915,7 +915,9 @@
 ; CHECK: ld1r_2s_from_dup
 ; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
 ; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
-; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: ushll.8h [[ARG1]], [[ARG1]], #0
+; CHECK-NEXT: ushll.8h [[ARG2]], [[ARG2]], #0
+; CHECK-NEXT: sub.4h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: str d[[RESREGNUM]], [x2]
 ; CHECK-NEXT: ret
   %tmp = bitcast i8* %a to i32*
Index: test/CodeGen/X86/avx-vperm2x128.ll
===================================================================
--- test/CodeGen/X86/avx-vperm2x128.ll
+++ test/CodeGen/X86/avx-vperm2x128.ll
@@ -210,8 +210,8 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_4501:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 entry:
Index: test/CodeGen/X86/avx1-logical-load-folding.ll
===================================================================
--- test/CodeGen/X86/avx1-logical-load-folding.ll
+++ test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -8,18 +8,16 @@
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovaps (%ecx), %ymm0
-; X86-NEXT: vandps LCPI0_0, %ymm0, %ymm0
+; X86-NEXT: vmovaps (%ecx), %xmm0
+; X86-NEXT: vandps LCPI0_0, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%eax)
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test1:
 ; X64: ## %bb.0:
-; X64-NEXT: vmovaps (%rdi), %ymm0
-; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -37,18 +35,16 @@
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovaps (%ecx), %ymm0
-; X86-NEXT: vorps LCPI1_0, %ymm0, %ymm0
+; X86-NEXT: vmovaps (%ecx), %xmm0
+; X86-NEXT: vorps LCPI1_0, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%eax)
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test2:
 ; X64: ## %bb.0:
-; X64-NEXT: vmovaps (%rdi), %ymm0
-; X64-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -66,18 +62,16 @@
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovaps (%ecx), %ymm0
-; X86-NEXT: vxorps LCPI2_0, %ymm0, %ymm0
+; X86-NEXT: vmovaps (%ecx), %xmm0
+; X86-NEXT: vxorps LCPI2_0, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%eax)
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test3:
 ; X64: ## %bb.0:
-; X64-NEXT: vmovaps (%rdi), %ymm0
-; X64-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -94,18 +88,16 @@
 ; X86: ## %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: vmovaps (%ecx), %ymm0
-; X86-NEXT: vandnps LCPI3_0, %ymm0, %ymm0
+; X86-NEXT: vmovaps (%ecx), %xmm0
+; X86-NEXT: vandnps LCPI3_0, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%eax)
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: test4:
 ; X64: ## %bb.0:
-; X64-NEXT: vmovaps (%rdi), %ymm0
-; X64-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vandnps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT: vmovss %xmm0, (%rsi)
-; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
Index: test/CodeGen/X86/i64-mem-copy.ll
===================================================================
--- test/CodeGen/X86/i64-mem-copy.ll
+++ test/CodeGen/X86/i64-mem-copy.ll
@@ -90,8 +90,9 @@
 ; X32AVX-LABEL: store_i64_from_vector256:
 ; X32AVX: # %bb.0:
 ; X32AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X32AVX-NEXT: vextracti128 $1, %ymm1, %xmm1
 ; X32AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; X32AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; X32AVX-NEXT: vmovq %xmm0, (%eax)
 ; X32AVX-NEXT: vzeroupper
 ; X32AVX-NEXT: retl
Index: test/CodeGen/X86/pr36199.ll
===================================================================
--- test/CodeGen/X86/pr36199.ll
+++ test/CodeGen/X86/pr36199.ll
@@ -4,7 +4,7 @@
 define void @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovups %zmm0, (%rax)
Index: test/CodeGen/X86/sad.ll
===================================================================
--- test/CodeGen/X86/sad.ll
+++ test/CodeGen/X86/sad.ll
@@ -1359,7 +1359,7 @@
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -1371,7 +1371,7 @@
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
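
Reviewer note, not part of the patch: the DAGCombiner.cpp hunk hinges on recomputing the extract index (ConcatOpNum = ExtractIndex / NumElems, then scaling by the narrow binop's element count), because the original index operand is counted in the extract's possibly bitcast element type. Below is a minimal standalone sketch of that arithmetic; the helper name remapExtractIndex and the sample vector types (v16i16 viewed as v4i64) are illustrative assumptions, and it presumes the divisibility checks the combine already performs.

#include <cassert>

// Illustrative sketch only: mirrors the index math added in the patch.
// ExtractIndex and NumElems are counted in the extract's (possibly bitcast)
// element type; the result is an index counted in the narrow binop's
// element type.
static unsigned remapExtractIndex(unsigned ExtractIndex, unsigned NumElems,
                                  unsigned NarrowNumElems) {
  // Which concat/half operand of the wide binop the extract reads from.
  unsigned ConcatOpNum = ExtractIndex / NumElems;
  // The same position expressed in the narrow binop's element units.
  return ConcatOpNum * NarrowNumElems;
}

int main() {
  // Example (assumed types): a v16i16 binop viewed as v4i64, extracting the
  // high v2i64 half: ExtractIndex = 2, NumElems = 2, so the narrow v8i16
  // binop must be fed subvector extracts starting at element 8.
  assert(remapExtractIndex(2, 2, 8) == 8);
  // The low half maps to index 0 regardless of the bitcast element width.
  assert(remapExtractIndex(0, 2, 8) == 0);
  return 0;
}

This is also why the patch builds NewExtIndex from ExtBOIdx rather than reusing Extract->getOperand(1): the original index is in the extract type's element units, which can differ from WideBVT's units once a bitcast has been looked through.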