diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11207,8 +11207,7 @@
   }
 
   // Simplify the operands using demanded-bits information.
-  if (!VT.isVector() &&
-      SimplifyDemandedBits(SDValue(N, 0)))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1868,7 +1868,8 @@
     // zero/one bits live out.
     unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
     APInt TruncMask = DemandedBits.zext(OperandBitWidth);
-    if (SimplifyDemandedBits(Src, TruncMask, Known, TLO, Depth + 1))
+    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
+                             Depth + 1))
       return true;
     Known = Known.trunc(BitWidth);
 
@@ -1891,9 +1892,9 @@
         // undesirable.
         break;
 
-      SDValue ShAmt = Src.getOperand(1);
-      auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt);
-      if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
+      const APInt *ShAmtC =
+          TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
+      if (!ShAmtC)
         break;
 
       uint64_t ShVal = ShAmtC->getZExtValue();
 
@@ -1905,6 +1906,7 @@
       if (!(HighBits & DemandedBits)) {
         // None of the shifted in bits are needed. Add a truncate of the
         // shift input, then shift it.
+        SDValue ShAmt = Src.getOperand(1);
         if (TLO.LegalTypes())
           ShAmt = TLO.DAG.getConstant(ShVal, dl, getShiftAmountTy(VT, DL));
         SDValue NewTrunc =
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -8409,6 +8409,13 @@
   return true;
 }
 
+/// isAnyExtended - Check if a node is a vector value that is any-extended.
+static bool isAnyExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ANY_EXTEND)
+    return true;
+  return false;
+}
+
 /// isSignExtended - Check if a node is a vector value that is sign-extended
 /// or a constant BUILD_VECTOR with sign-extended elements.
 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
@@ -8496,7 +8503,8 @@
 /// before extension is less than 64 bits we add a an extension to resize
 /// the vector to 64 bits.
 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
-  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+  if (N->getOpcode() == ISD::SIGN_EXTEND ||
+      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                         N->getOperand(0)->getValueType(0),
                                         N->getValueType(0),
@@ -8577,14 +8585,16 @@
   SDNode *N1 = Op.getOperand(1).getNode();
   unsigned NewOpc = 0;
   bool isMLA = false;
+  bool isN0AExt = isAnyExtended(N0, DAG);
+  bool isN1AExt = isAnyExtended(N1, DAG);
   bool isN0SExt = isSignExtended(N0, DAG);
   bool isN1SExt = isSignExtended(N1, DAG);
-  if (isN0SExt && isN1SExt)
+  if ((isN0SExt || isN0AExt) && (isN1SExt || isN1AExt))
     NewOpc = ARMISD::VMULLs;
   else {
     bool isN0ZExt = isZeroExtended(N0, DAG);
     bool isN1ZExt = isZeroExtended(N1, DAG);
-    if (isN0ZExt && isN1ZExt)
+    if ((isN0ZExt || isN0AExt) && (isN1ZExt || isN1AExt))
       NewOpc = ARMISD::VMULLu;
     else if (isN1SExt || isN1ZExt) {
       // Look for (s/zext A + s/zext B) * (s/zext C).
We want to turn these diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -237,8 +237,8 @@ ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; VI-NEXT: s_or_b32 s0, s1, 4 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_and_b32 s0, s0, 0xff -; VI-NEXT: v_or_b32_e32 v2, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 diff --git a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll --- a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll +++ b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll @@ -4,18 +4,19 @@ define void @func1(i16* %a, i16* %b, i16* %c) { ; CHECK-LABEL: func1: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr d16, [r2, #16] ; CHECK-NEXT: add r3, r1, #16 -; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vld1.16 {d16}, [r3:64] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vaddw.s16 q10, q8, d18 -; CHECK-NEXT: vmovn.i32 d19, q10 +; CHECK-NEXT: vld1.16 {d17}, [r3:64] +; CHECK-NEXT: vmovl.u16 q9, d17 +; CHECK-NEXT: vmovl.u16 q10, d16 +; CHECK-NEXT: vadd.i32 q10, q10, q9 +; CHECK-NEXT: vmovn.i32 d17, q10 ; CHECK-NEXT: vldr d20, [r0, #16] -; CHECK-NEXT: vstr d19, [r0, #16] -; CHECK-NEXT: vldr d19, [r2, #16] -; CHECK-NEXT: vmull.s16 q11, d18, d19 -; CHECK-NEXT: vmovl.s16 q9, d19 -; CHECK-NEXT: vmla.i32 q11, q8, q9 +; CHECK-NEXT: vstr d17, [r0, #16] +; CHECK-NEXT: vldr d17, [r2, #16] +; CHECK-NEXT: vmull.s16 q11, d16, d17 +; CHECK-NEXT: vmovl.s16 q8, d17 +; CHECK-NEXT: vmla.i32 q11, q9, q8 ; CHECK-NEXT: vmovn.i32 d16, q11 ; CHECK-NEXT: vstr d16, [r1, #16] ; CHECK-NEXT: vldr d16, [r2, #16] @@ -75,23 +76,25 @@ define void @func2(i16* %a, i16* %b, i16* %c) { ; CHECK-LABEL: func2: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr d16, [r2, #16] ; CHECK-NEXT: add r3, r1, #16 -; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vld1.16 {d16}, [r3:64] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vaddw.s16 q10, q8, d18 -; CHECK-NEXT: vmovn.i32 d19, q10 +; CHECK-NEXT: vld1.16 {d17}, [r3:64] +; CHECK-NEXT: vmovl.u16 q9, d17 +; CHECK-NEXT: vmovl.u16 q10, d16 +; CHECK-NEXT: vadd.i32 q10, q10, q9 +; CHECK-NEXT: vmovn.i32 d17, q10 ; CHECK-NEXT: vldr d20, [r0, #16] -; CHECK-NEXT: vstr d19, [r0, #16] -; CHECK-NEXT: vldr d19, [r2, #16] -; CHECK-NEXT: vmull.s16 q11, d18, d19 -; CHECK-NEXT: vmovl.s16 q9, d19 -; CHECK-NEXT: vmla.i32 q11, q8, q9 +; CHECK-NEXT: vstr d17, [r0, #16] +; CHECK-NEXT: vldr d17, [r2, #16] +; CHECK-NEXT: vmull.s16 q11, d16, d17 +; CHECK-NEXT: vmovl.s16 q8, d17 +; CHECK-NEXT: vmla.i32 q11, q9, q8 ; CHECK-NEXT: vmovn.i32 d16, q11 ; CHECK-NEXT: vstr d16, [r1, #16] ; CHECK-NEXT: vldr d16, [r2, #16] ; CHECK-NEXT: vmlal.s16 q11, d16, d20 -; CHECK-NEXT: vaddw.s16 q8, q11, d20 +; CHECK-NEXT: vmovl.u16 q8, d20 +; CHECK-NEXT: vadd.i32 q8, q11, q8 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -178,69 +178,65 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vmovq %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v24i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm3 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm1, (%rax) ; 
AVX2-NEXT: vmovdqu %xmm0, (%rax) @@ -453,174 +449,163 @@ ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm15 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm7 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm14 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm4[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm3, %xmm12, %xmm11 -; AVX1-NEXT: vpsubd %xmm3, %xmm10, %xmm10 -; AVX1-NEXT: vpsubd %xmm3, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm3, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm3, %xmm15, %xmm12 -; AVX1-NEXT: vpsubd %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpsubd %xmm3, %xmm14, %xmm0 -; AVX1-NEXT: vpsubd %xmm3, %xmm13, %xmm2 -; AVX1-NEXT: vpsubd %xmm3, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm4 -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm7, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm12, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4 ; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm10, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} 
xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm9, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm6 +; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpsubw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsubw %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: 
vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm0[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackuswb 
%xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqu %xmm1, (%rax) -; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm7 +; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm8 +; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm9 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm1 +; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm6, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpsrlw $1, %xmm3, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5 +; AVX2-NEXT: vpsrlw $1, %xmm5, %xmm5 +; AVX2-NEXT: vpand %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm8, %xmm5 +; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5 +; AVX2-NEXT: vpsrlw $1, %xmm5, %xmm5 +; AVX2-NEXT: vpand %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm9, %xmm6 +; AVX2-NEXT: vpsubw %xmm2, %xmm6, %xmm6 +; AVX2-NEXT: vpsrlw $1, %xmm6, %xmm6 +; AVX2-NEXT: vpand %xmm3, %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-NEXT: vpackuswb %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm4, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -248,15 +248,17 @@ ; ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -251,9 +251,8 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind { ; X86-LABEL: signbits_sext_shuffle_sitofp: ; X86: # %bb.0: -; X86-NEXT: vpmovsxdq %xmm0, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-NEXT: vpmovsxdq %xmm0, %xmm0 +; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X86-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -264,9 +263,8 @@ ; ; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -277,7 +275,7 @@ ; ; X64-AVX2-LABEL: signbits_sext_shuffle_sitofp: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] @@ -325,18 +323,12 @@ ; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { -; X86-LABEL: signbits_ashr_concat_ashr_extract_sitofp: -; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-NEXT: vcvtdq2pd %xmm0, %xmm0 -; X86-NEXT: retl -; -; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: -; X64: # %bb.0: -; X64-NEXT: vpsrlq $32, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 -; X64-NEXT: retq +; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrlq $32, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} %1 = ashr <2 x i64> %a0, %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ 
-933,9 +933,9 @@ define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vpsraw $8, 32(%rdi), %ymm0 -; CHECK-AVX512-NEXT: vpsraw $8, (%rdi), %ymm1 -; CHECK-AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0 +; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1 +; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-AVX512-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -66,22 +66,26 @@ ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: retq entry: @@ -387,33 +391,48 @@ } define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { -; SSE-LABEL: trunc8i32_8i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: trunc8i32_8i16_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc8i32_8i16_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc8i32_8i16_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc8i32_8i16_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc8i32_8i16_ashr: ; AVX2: # %bb.0: # %entry 
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc8i32_8i16_ashr: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -421,14 +440,14 @@ ; ; AVX512VL-LABEL: trunc8i32_8i16_ashr: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc8i32_8i16_ashr: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -436,7 +455,7 @@ ; ; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: ; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -689,28 +708,52 @@ } define void @trunc16i32_16i16_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movdqu %xmm2, (%rax) -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i16_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i16_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm3 +; SSSE3-NEXT: psrad $16, %xmm2 +; SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSSE3-NEXT: movdqu %xmm2, (%rax) +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i16_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $16, %xmm3 +; SSE41-NEXT: psrld $16, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: movdqu %xmm2, (%rax) +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i16_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, 
(%rax) ; AVX1-NEXT: vzeroupper @@ -718,9 +761,9 @@ ; ; AVX2-LABEL: trunc16i32_16i16_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -895,41 +938,65 @@ } define void @trunc16i32_16i8_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i8_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psrad $24, %xmm3 -; SSE-NEXT: psrad $24, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: trunc16i32_16i8_ashr: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: psrld $24, %xmm1 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: psrld $24, %xmm3 +; SSE2-NEXT: psrld $24, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: trunc16i32_16i8_ashr: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: psrld $24, %xmm1 +; SSSE3-NEXT: psrld $24, %xmm0 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: psrld $24, %xmm3 +; SSSE3-NEXT: psrld $24, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqu %xmm0, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: trunc16i32_16i8_ashr: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: psrld $24, %xmm1 +; SSE41-NEXT: psrld $24, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psrld $24, %xmm3 +; SSE41-NEXT: psrld $24, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqu %xmm0, (%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i8_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i32_16i8_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1089,27 +1156,27 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) { ; SSE-LABEL: trunc16i16_16i8_ashr: ; SSE: # %bb.0: # %entry -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: packsswb %xmm1, 
%xmm0 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqu %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i16_16i8_ashr: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i16_16i8_ashr: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1132,7 +1199,7 @@ ; ; AVX512BW-LABEL: trunc16i16_16i8_ashr: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper