Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16640,18 +16640,23 @@
   // If extraction is cheap, we don't need to look at the binop operands
   // for concat ops. The narrow binop alone makes this transform profitable.
-  // TODO: We're not dealing with the bitcasted pattern here. That limitation
-  // should be lifted.
-  if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
-      TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
+  // We can't just reuse the original extract index operand because we may have
+  // bitcasted.
+  unsigned ConcatOpNum = ExtractIndex / NumElems;
+  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
+      BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
     // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
     SDLoc DL(Extract);
+    SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
     SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(0), Extract->getOperand(1));
+                            BinOp.getOperand(0), NewExtIndex);
     SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
-                            BinOp.getOperand(1), Extract->getOperand(1));
-    return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
-                       BinOp.getNode()->getFlags());
+                            BinOp.getOperand(1), NewExtIndex);
+    SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
+                                      BinOp.getNode()->getFlags());
+    return DAG.getBitcast(VT, NarrowBinOp);
   }
 
   // Only handle the case where we are doubling and then halving. A larger ratio
@@ -16680,11 +16685,7 @@
     return SDValue();
 
   // If one of the binop operands was not the result of a concat, we must
-  // extract a half-sized operand for our new narrow binop. We can't just reuse
-  // the original extract index operand because we may have bitcasted.
-  unsigned ConcatOpNum = ExtractIndex / NumElems;
-  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
-  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  // extract a half-sized operand for our new narrow binop.
   SDLoc DL(Extract);
   // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
Index: llvm/trunk/test/CodeGen/AArch64/arm64-ld1.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-ld1.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-ld1.ll
@@ -915,7 +915,9 @@
 ; CHECK: ld1r_2s_from_dup
 ; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
 ; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
-; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: ushll.8h [[ARG1]], [[ARG1]], #0
+; CHECK-NEXT: ushll.8h [[ARG2]], [[ARG2]], #0
+; CHECK-NEXT: sub.4h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: str d[[RESREGNUM]], [x2]
 ; CHECK-NEXT: ret
   %tmp = bitcast i8* %a to i32*
Index: llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
+++ llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
@@ -210,8 +210,8 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_4501:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 entry:
Index: llvm/trunk/test/CodeGen/X86/avx1-logical-load-folding.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ llvm/trunk/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -8,18 +8,16 @@
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vandps LCPI0_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vandps LCPI0_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -37,18 +35,16 @@
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vorps LCPI1_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vorps LCPI1_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -66,18 +62,16 @@
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vxorps LCPI2_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vxorps LCPI2_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
@@ -94,18 +88,16 @@
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vandnps LCPI3_0, %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vandnps LCPI3_0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%eax)
-; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vmovaps (%rdi), %ymm0
-; X64-NEXT:    vandnps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandnps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-NEXT:    vmovss %xmm0, (%rsi)
-; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %tmp1 = bitcast float* %A to <8 x float>*
   %tmp2 = load <8 x float>, <8 x float>* %tmp1, align 32
Index: llvm/trunk/test/CodeGen/X86/i64-mem-copy.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/i64-mem-copy.ll
+++ llvm/trunk/test/CodeGen/X86/i64-mem-copy.ll
@@ -90,8 +90,9 @@
 ; X32AVX-LABEL: store_i64_from_vector256:
 ; X32AVX:       # %bb.0:
 ; X32AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; X32AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; X32AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; X32AVX-NEXT:    vmovq %xmm0, (%eax)
 ; X32AVX-NEXT:    vzeroupper
 ; X32AVX-NEXT:    retl
Index: llvm/trunk/test/CodeGen/X86/pr36199.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr36199.ll
+++ llvm/trunk/test/CodeGen/X86/pr36199.ll
@@ -4,7 +4,7 @@
 define void @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vaddps %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; CHECK-NEXT:    vmovups %zmm0, (%rax)
Index: llvm/trunk/test/CodeGen/X86/sad.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll
+++ llvm/trunk/test/CodeGen/X86/sad.ll
@@ -1359,7 +1359,7 @@
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1371,7 +1371,7 @@
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1853,7 +1853,7 @@
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1948,7 +1948,7 @@
 ; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1987,7 +1987,7 @@
 ; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -2229,7 +2229,7 @@
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -2278,7 +2278,7 @@
 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -2327,7 +2327,7 @@
 ; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -2375,7 +2375,7 @@
 ; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2423,7 +2423,7 @@
 ; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -2801,7 +2801,7 @@
 ; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -2859,7 +2859,7 @@
 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -2917,7 +2917,7 @@
 ; AVX512BWVL-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -2983,7 +2983,7 @@
 ; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3049,7 +3049,7 @@
 ; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
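
The index remapping in the DAGCombiner hunk above can be shown in isolation. What follows is a minimal standalone C++ sketch, not part of the patch: the helper name rescaleExtractIndex and the concrete vector widths (a v16i16 binop bitcast to v8i32, extracting the high v4i32 half) are made up for illustration, but the arithmetic mirrors the ConcatOpNum / ExtBOIdx computation introduced by the change.

#include <cassert>
#include <iostream>

// Rescale an extract_subvector index from the bitcast type's element units
// (NumElems elements per extracted chunk) to the wide binop type's element
// units (NarrowNumElems elements per extracted chunk).
unsigned rescaleExtractIndex(unsigned ExtractIndex, unsigned NumElems,
                             unsigned NarrowNumElems) {
  unsigned ConcatOpNum = ExtractIndex / NumElems; // which chunk is extracted
  return ConcatOpNum * NarrowNumElems;            // same chunk, new element units
}

int main() {
  // Example: a v16i16 binop viewed (bitcast) as v8i32; we extract the high
  // v4i32 half, so ExtractIndex = 4 and NumElems = 4. The narrow binop type
  // is v8i16, so NarrowNumElems = 8 and the remapped index is 8.
  unsigned Idx = rescaleExtractIndex(/*ExtractIndex=*/4, /*NumElems=*/4,
                                     /*NarrowNumElems=*/8);
  assert(Idx == 8 && "high half of v16i16 starts at element 8");
  std::cout << "narrow extract index = " << Idx << '\n';
  return 0;
}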