diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -856,6 +856,13 @@ return; } break; + case ISD::USHLSAT: + case ISD::SSHLSAT: + if (SDValue Expanded = TLI.expandShlSat(Node, DAG)) { + Results.push_back(Expanded); + return; + } + break; case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: // Expand the fpsosisat if it is scalable to prevent it from unrolling below. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9227,9 +9227,13 @@ assert(VT == RHS.getValueType() && "Expected operands to be the same type"); assert(VT.isInteger() && "Expected operands to be integers"); + if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + return DAG.UnrollVectorOp(Node); + // If LHS != (LHS << RHS) >> RHS, we have overflow and must saturate. unsigned BW = VT.getScalarSizeInBits(); + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue Result = DAG.getNode(ISD::SHL, dl, VT, LHS, RHS); SDValue Orig = DAG.getNode(IsSigned ? ISD::SRA : ISD::SRL, dl, VT, Result, RHS); @@ -9238,14 +9242,14 @@ if (IsSigned) { SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(BW), dl, VT); SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(BW), dl, VT); - SatVal = DAG.getSelectCC(dl, LHS, DAG.getConstant(0, dl, VT), - SatMin, SatMax, ISD::SETLT); + SDValue Cond = + DAG.getSetCC(dl, BoolVT, LHS, DAG.getConstant(0, dl, VT), ISD::SETLT); + SatVal = DAG.getSelect(dl, VT, Cond, SatMin, SatMax); } else { SatVal = DAG.getConstant(APInt::getMaxValue(BW), dl, VT); } - Result = DAG.getSelectCC(dl, LHS, Orig, SatVal, Result, ISD::SETNE); - - return Result; + SDValue Cond = DAG.getSetCC(dl, BoolVT, LHS, Orig, ISD::SETNE); + return DAG.getSelect(dl, VT, Cond, SatVal, Result); } SDValue diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll --- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll @@ -9,34 +9,18 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: vec_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, ma -; CHECK-NEXT: vmv.x.s a2, v8 -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: sll a0, a2, a3 -; CHECK-NEXT: sra a3, a0, a3 -; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: beq a2, a3, .LBB0_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: slti a0, a2, 0 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: .LBB0_2: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a3, v8 -; CHECK-NEXT: sll a2, a3, a4 -; CHECK-NEXT: sra a4, a2, a4 -; CHECK-NEXT: beq a3, a4, .LBB0_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: slti a2, a3, 0 -; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.x v8, a2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x 
v9, a1 +; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <2 x i64> @llvm.sshl.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %tmp @@ -45,69 +29,19 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: slli a2, a0, 32 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: beq a2, a3, .LBB1_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB1_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: srli a1, a1, 32 -; CHECK-NEXT: sw a1, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: beq a2, a3, .LBB1_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: srli a3, a1, 32 -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sw a3, 12(sp) -; CHECK-NEXT: beq a2, a4, .LBB1_6 -; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB1_6: -; CHECK-NEXT: srli a3, a1, 32 -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a2, a1, 32 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sw a3, 8(sp) -; CHECK-NEXT: beq a2, a4, .LBB1_8 -; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB1_8: -; CHECK-NEXT: srli a0, a1, 32 -; CHECK-NEXT: sw a0, 4(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: slli a0, a0, 31 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp @@ -116,125 +50,17 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: vec_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: slli a2, a0, 48 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: beq a2, a3, .LBB2_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: srli a1, a1, 48 -; CHECK-NEXT: sh a1, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; 
CHECK-NEXT: vslidedown.vi v10, v9, 7 -; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: beq a2, a3, .LBB2_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_4: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v10, v9, 6 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 14(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_6 -; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_6: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 12(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_8 -; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_8: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v10, v9, 4 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 10(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_10 -; CHECK-NEXT: # %bb.9: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_10: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 8(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_12 -; CHECK-NEXT: # %bb.11: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_12: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 6(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_14 -; CHECK-NEXT: # %bb.13: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_14: -; CHECK-NEXT: srli a3, a1, 48 -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a2, a1, 48 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sh a3, 4(sp) -; CHECK-NEXT: beq a2, a4, .LBB2_16 -; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB2_16: -; CHECK-NEXT: srli a0, a1, 48 -; CHECK-NEXT: sh a0, 2(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = 
call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %tmp @@ -243,238 +69,102 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: vec_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: slli a2, a0, 56 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: beq a2, a3, .LBB3_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: srli a1, a1, 56 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 15 -; CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 15 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a3 -; CHECK-NEXT: sra a3, a1, a3 -; CHECK-NEXT: beq a2, a3, .LBB3_4 -; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_4: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 14 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 14 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 15(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_6 -; CHECK-NEXT: # %bb.5: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_6: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 13 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 13 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 14(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_8 -; CHECK-NEXT: # %bb.7: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_8: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 12 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 12 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 13(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_10 -; CHECK-NEXT: # %bb.9: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_10: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 11 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 11 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 12(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_12 -; CHECK-NEXT: # %bb.11: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_12: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 10 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 10 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 11(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_14 -; CHECK-NEXT: # %bb.13: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_14: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 9 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 9 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll 
a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 10(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_16 -; CHECK-NEXT: # %bb.15: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_16: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 8 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 8 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 9(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_18 -; CHECK-NEXT: # %bb.17: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_18: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 7 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 8(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_20 -; CHECK-NEXT: # %bb.19: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_20: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 6 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 7(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_22 -; CHECK-NEXT: # %bb.21: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_22: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 6(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_24 -; CHECK-NEXT: # %bb.23: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_24: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 4 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 5(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_26 -; CHECK-NEXT: # %bb.25: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_26: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 4(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_28 -; CHECK-NEXT: # %bb.27: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_28: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; CHECK-NEXT: sb a3, 3(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_30 -; CHECK-NEXT: # %bb.29: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_30: -; CHECK-NEXT: srli a3, a1, 56 -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a2, a1, 56 -; CHECK-NEXT: sll a1, a2, a4 -; CHECK-NEXT: sra a4, a1, a4 -; 
CHECK-NEXT: sb a3, 2(sp) -; CHECK-NEXT: beq a2, a4, .LBB3_32 -; CHECK-NEXT: # %bb.31: -; CHECK-NEXT: slti a1, a2, 0 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: .LBB3_32: -; CHECK-NEXT: srli a0, a1, 56 -; CHECK-NEXT: sb a0, 1(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsra.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vmerge.vxm v9, v9, a0, v0 +; CHECK-NEXT: vmv.v.v v0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %tmp } + +declare <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>) +declare <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>) +declare <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>) + +define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) nounwind { +; CHECK-LABEL: vec_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 2 x i64> @llvm.sshl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) + ret <vscale x 2 x i64> %tmp +} + +define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) nounwind { +; CHECK-LABEL: vec_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: slli a0, a0, 31 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) + ret <vscale x 4 x i32> %tmp +} + +define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) nounwind { +; CHECK-LABEL: vec_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) + ret <vscale x 8 x i16> %tmp +} + +define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) nounwind { +; CHECK-LABEL: vec_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmslt.vx v0, v8, zero +; CHECK-NEXT: li a0, 127 +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsra.vv v14, v12, v10 +; CHECK-NEXT: vmsne.vv v10, v8, v14 +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) + ret <vscale x 16 x i8> %tmp +} diff --git a/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll --- a/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ushl_sat_vec.ll @@ -9,30 +9,11 @@ define <2 x i64>
@vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; CHECK-LABEL: vec_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsrl.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v0, v8, v9 +; CHECK-NEXT: vmerge.vim v8, v10, -1, v0 ; CHECK-NEXT: ret %tmp = call <2 x i64> @llvm.ushl.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %tmp @@ -41,63 +22,11 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 32 -; CHECK-NEXT: sw a0, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 32 -; CHECK-NEXT: sw a0, 12(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 32 -; CHECK-NEXT: sw a0, 8(sp) -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 32 -; CHECK-NEXT: sw a0, 4(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsrl.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v0, v8, v9 +; CHECK-NEXT: vmerge.vim v8, v10, -1, v0 ; CHECK-NEXT: ret %tmp = call <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %tmp @@ -106,115 +35,11 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; CHECK-LABEL: vec_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: vsetivli zero, 0, e16, 
m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 7 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 14(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 6 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 12(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 10(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 4 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 8(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 6(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 4(sp) -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 48 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 48 -; CHECK-NEXT: sh a0, 2(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsrl.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v0, v8, v9 +; CHECK-NEXT: vmerge.vim v8, v10, -1, v0 ; CHECK-NEXT: ret %tmp 
= call <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %tmp @@ -223,220 +48,69 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: vec_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: vsetivli zero, 0, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 0(sp) -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 15 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 15 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 15(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 14 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 14 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 14(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 13 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 13 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 13(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 12 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 12 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 12(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 11 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 11 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 11(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 10 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 10 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 10(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 9 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 9 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 9(sp) 
-; CHECK-NEXT: vslidedown.vi v10, v9, 8 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 8 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 8(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 7 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 7(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 6 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 6(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 5 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 5(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 4 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 4(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 3 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 3(sp) -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 2(sp) -; CHECK-NEXT: vslidedown.vi v9, v9, 1 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vmv.x.s a1, v8 -; CHECK-NEXT: slli a1, a1, 56 -; CHECK-NEXT: sll a2, a1, a0 -; CHECK-NEXT: srl a0, a2, a0 -; CHECK-NEXT: xor a0, a1, a0 -; CHECK-NEXT: seqz a0, a0 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: or a0, a0, a2 -; CHECK-NEXT: srli a0, a0, 56 -; CHECK-NEXT: sb a0, 1(sp) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vsll.vv v10, v8, v9 +; CHECK-NEXT: vsrl.vv v9, v10, v9 +; CHECK-NEXT: vmsne.vv v0, v8, v9 +; CHECK-NEXT: vmerge.vim v8, v10, -1, v0 ; 
CHECK-NEXT: ret %tmp = call <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8> %x, <16 x i8> %y) ret <16 x i8> %tmp } + +declare <vscale x 2 x i64> @llvm.ushl.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>) +declare <vscale x 4 x i32> @llvm.ushl.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.ushl.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>) +declare <vscale x 16 x i8> @llvm.ushl.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>) + +define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) nounwind { +; CHECK-LABEL: vec_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsrl.vv v10, v12, v10 +; CHECK-NEXT: vmsne.vv v0, v8, v10 +; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 2 x i64> @llvm.ushl.sat.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) + ret <vscale x 2 x i64> %tmp +} + +define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) nounwind { +; CHECK-LABEL: vec_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsrl.vv v10, v12, v10 +; CHECK-NEXT: vmsne.vv v0, v8, v10 +; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 4 x i32> @llvm.ushl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) + ret <vscale x 4 x i32> %tmp +} + +define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) nounwind { +; CHECK-LABEL: vec_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsrl.vv v10, v12, v10 +; CHECK-NEXT: vmsne.vv v0, v8, v10 +; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 8 x i16> @llvm.ushl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) + ret <vscale x 8 x i16> %tmp +} + +define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) nounwind { +; CHECK-LABEL: vec_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vsll.vv v12, v8, v10 +; CHECK-NEXT: vsrl.vv v10, v12, v10 +; CHECK-NEXT: vmsne.vv v0, v8, v10 +; CHECK-NEXT: vmerge.vim v8, v12, -1, v0 +; CHECK-NEXT: ret + %tmp = call <vscale x 16 x i8> @llvm.ushl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) + ret <vscale x 16 x i8> %tmp +} diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -11,73 +11,53 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X64-LABEL: vec_v2i64: ; X64: # %bb.0: -; X64-NEXT: movq %xmm0, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: testq %rax, %rax -; X64-NEXT: sets %dl -; X64-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: shlq %cl, %rdi -; X64-NEXT: movq %rdi, %r8 -; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: sarq %cl, %r8 -; X64-NEXT: cmpq %r8, %rax -; X64-NEXT: cmovneq %rdx, %rdi -; X64-NEXT: movq %rdi, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: testq %rax, %rax -; X64-NEXT: sets %dl -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm0, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: shlq %cl, %rsi -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: sarq %cl, %rdi -; X64-NEXT: cmpq %rdi, %rax -; X64-NEXT: cmovneq %rdx, %rsi -; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psrlq %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X64-NEXT: movdqa %xmm2, %xmm5 +; X64-NEXT: psrlq %xmm4, %xmm5 +; X64-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; X64-NEXT: movdqa %xmm0, %xmm6 +; X64-NEXT: psllq %xmm1, %xmm6 +; X64-NEXT: movdqa %xmm0,
%xmm3 +; X64-NEXT: psllq %xmm4, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm7 +; X64-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; X64-NEXT: psrlq %xmm1, %xmm6 +; X64-NEXT: psrlq %xmm4, %xmm7 +; X64-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; X64-NEXT: xorpd %xmm5, %xmm7 +; X64-NEXT: psubq %xmm5, %xmm7 +; X64-NEXT: pcmpeqd %xmm0, %xmm7 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2] +; X64-NEXT: pand %xmm7, %xmm1 +; X64-NEXT: andpd %xmm1, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtd %xmm4, %xmm5 +; X64-NEXT: pcmpeqd %xmm4, %xmm4 +; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: pandn %xmm4, %xmm2 +; X64-NEXT: por %xmm0, %xmm2 +; X64-NEXT: pandn %xmm2, %xmm1 +; X64-NEXT: por %xmm3, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: testq %rax, %rax -; X64-AVX2-NEXT: sets %dl -; X64-AVX2-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; X64-AVX2-NEXT: addq %rsi, %rdx -; X64-AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; X64-AVX2-NEXT: movq %rax, %rdi -; X64-AVX2-NEXT: shlq %cl, %rdi -; X64-AVX2-NEXT: movq %rdi, %r8 -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-AVX2-NEXT: sarq %cl, %r8 -; X64-AVX2-NEXT: cmpq %r8, %rax -; X64-AVX2-NEXT: cmovneq %rdx, %rdi -; X64-AVX2-NEXT: vmovq %rdi, %xmm2 -; X64-AVX2-NEXT: vmovq %xmm0, %rax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: testq %rax, %rax -; X64-AVX2-NEXT: sets %dl -; X64-AVX2-NEXT: addq %rsi, %rdx -; X64-AVX2-NEXT: vmovq %xmm1, %rcx -; X64-AVX2-NEXT: movq %rax, %rsi -; X64-AVX2-NEXT: shlq %cl, %rsi -; X64-AVX2-NEXT: movq %rsi, %rdi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-AVX2-NEXT: sarq %cl, %rdi -; X64-AVX2-NEXT: cmpq %rdi, %rax -; X64-AVX2-NEXT: cmovneq %rdx, %rsi -; X64-AVX2-NEXT: vmovq %rsi, %xmm0 -; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807] +; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm4 +; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm4, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsubq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm4, %xmm3, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v2i64: @@ -175,132 +155,53 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec_v4i32: ; X64: # %bb.0: -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sets %cl -; X64-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: cmpl %esi, %eax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %eax -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl 
%edx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sets %cl -; X64-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: cmpl %esi, %eax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sets %cl -; X64-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: cmpl %esi, %eax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: sets %cl -; X64-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-NEXT: cmpl %esi, %eax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X64-NEXT: pslld $23, %xmm1 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttps2dq %xmm1, %xmm5 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X64-NEXT: pmuludq %xmm7, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; X64-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: psrad %xmm5, %xmm7 +; X64-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] +; X64-NEXT: movdqa %xmm1, %xmm5 +; X64-NEXT: psrad %xmm2, %xmm5 +; X64-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; X64-NEXT: movdqa %xmm6, %xmm2 +; X64-NEXT: psrad %xmm3, %xmm2 +; X64-NEXT: psrad %xmm4, %xmm1 +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; X64-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-NEXT: pand %xmm1, %xmm6 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: pandn %xmm0, %xmm1 +; X64-NEXT: por %xmm6, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v4i32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrd $1, %xmm0, %eax -; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %edi, %edi -; X64-AVX2-NEXT: testl %eax, %eax -; X64-AVX2-NEXT: sets %dil -; X64-AVX2-NEXT: 
addl $2147483647, %edi # imm = 0x7FFFFFFF -; X64-AVX2-NEXT: cmpl %esi, %eax -; X64-AVX2-NEXT: cmovel %edx, %edi -; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testl %eax, %eax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-AVX2-NEXT: cmpl %esi, %eax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vmovd %ecx, %xmm2 -; X64-AVX2-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrd $2, %xmm0, %eax -; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testl %eax, %eax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-AVX2-NEXT: cmpl %esi, %eax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrd $3, %xmm0, %eax -; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testl %eax, %eax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF -; X64-AVX2-NEXT: cmpl %esi, %eax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; X64-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm2 +; X64-AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 +; X64-AVX2-NEXT: vpsravd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v4i32: @@ -376,241 +277,84 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-LABEL: vec_v8i16: ; X64: # %bb.0: -; X64-NEXT: pextrw $7, %xmm0, %eax -; X64-NEXT: pextrw $7, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: pextrw $6, %xmm0, %eax -; X64-NEXT: pextrw $6, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: pextrw $5, %xmm0, %eax -; X64-NEXT: pextrw $5, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl 
%dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: pextrw $4, %xmm0, %eax -; X64-NEXT: pextrw $4, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: pextrw $3, %xmm0, %eax -; X64-NEXT: pextrw $3, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: pextrw $2, %xmm0, %eax -; X64-NEXT: pextrw $2, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-NEXT: pextrw $1, %xmm0, %eax -; X64-NEXT: pextrw $1, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: sarl %cl, %esi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %cl -; X64-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X64-NEXT: pslld $23, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X64-NEXT: paddd %xmm3, %xmm2 +; X64-NEXT: cvttps2dq %xmm2, %xmm2 +; X64-NEXT: pslld $16, %xmm2 +; X64-NEXT: psrad $16, %xmm2 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; X64-NEXT: pslld $23, %xmm4 +; X64-NEXT: paddd %xmm3, %xmm4 +; 
X64-NEXT: cvttps2dq %xmm4, %xmm3 +; X64-NEXT: pslld $16, %xmm3 +; X64-NEXT: psrad $16, %xmm3 +; X64-NEXT: packssdw %xmm2, %xmm3 +; X64-NEXT: pmullw %xmm0, %xmm3 +; X64-NEXT: psllw $12, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psraw $15, %xmm2 +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: psraw $8, %xmm4 +; X64-NEXT: pand %xmm2, %xmm4 +; X64-NEXT: pandn %xmm3, %xmm2 +; X64-NEXT: por %xmm4, %xmm2 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psraw $15, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm5 +; X64-NEXT: psraw $4, %xmm2 +; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psraw $15, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm2, %xmm5 +; X64-NEXT: psraw $2, %xmm2 +; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: psraw $15, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: pandn %xmm2, %xmm4 +; X64-NEXT: psraw $1, %xmm2 +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: por %xmm4, %xmm2 +; X64-NEXT: pcmpeqw %xmm0, %xmm2 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: pandn %xmm0, %xmm2 +; X64-NEXT: por %xmm3, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v8i16: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrw $1, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $1, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movswl %si, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %edi -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: testw %dx, %dx -; X64-AVX2-NEXT: sets %al -; X64-AVX2-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovel %esi, %eax -; X64-AVX2-NEXT: vmovd %xmm0, %edx -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movswl %si, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %edi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %dx, %dx -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovel %esi, %ecx -; X64-AVX2-NEXT: vmovd %ecx, %xmm2 -; X64-AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $2, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $3, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $3, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, 
%ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $4, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $4, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $5, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $5, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $6, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $6, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $7, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $7, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movswl %dx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarl %cl, %esi -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testw %ax, %ax -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpsllvd %ymm1, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; X64-AVX2-NEXT: vpmovsxwd %xmm2, %ymm3 +; X64-AVX2-NEXT: vpsravd %ymm1, %ymm3, %ymm1 +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v8i16: @@ -748,492 +492,137 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-LABEL: vec_v16i8: ; X64: 
# %bb.0: -; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, 
%al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl 
-{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: sarb %cl, %sil -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %al, %al -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %sil, %al -; X64-NEXT: cmovel %edx, %ecx -; X64-NEXT: movd %ecx, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shlb %cl, %dil -; X64-NEXT: movzbl %dil, %edi -; X64-NEXT: movl %edi, %r8d -; X64-NEXT: sarb %cl, %r8b -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testb %sil, %sil -; X64-NEXT: sets %cl -; X64-NEXT: addl $127, %ecx -; X64-NEXT: cmpb %r8b, %sil -; X64-NEXT: cmovel %edi, %ecx -; X64-NEXT: movd %ecx, %xmm4 -; X64-NEXT: movl %edx, %esi -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: sarb %cl, %dil -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testb %dl, %dl -; X64-NEXT: sets %al -; X64-NEXT: addl $127, %eax -; X64-NEXT: cmpb %dil, %dl -; X64-NEXT: cmovel %esi, %eax -; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psllw $5, %xmm1 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtb %xmm1, %xmm4 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psllw $4, %xmm2 +; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: pandn %xmm0, %xmm4 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: por %xmm4, %xmm2 +; X64-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; X64-NEXT: paddb %xmm1, %xmm1 +; X64-NEXT: pxor %xmm6, %xmm6 +; X64-NEXT: pcmpgtb %xmm1, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: pandn %xmm2, %xmm7 +; X64-NEXT: psllw $2, %xmm2 +; X64-NEXT: pand %xmm6, %xmm2 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: por %xmm7, %xmm2 +; X64-NEXT: paddb %xmm1, %xmm1 +; X64-NEXT: pxor %xmm6, %xmm6 +; X64-NEXT: pcmpgtb %xmm1, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm1 +; X64-NEXT: pandn %xmm2, %xmm1 +; X64-NEXT: paddb %xmm2, %xmm2 +; X64-NEXT: pand %xmm6, %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtw %xmm4, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm7 +; X64-NEXT: psraw $4, %xmm6 +; X64-NEXT: pand %xmm1, %xmm6 +; X64-NEXT: por %xmm7, %xmm6 +; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtw %xmm4, 
%xmm1 +; X64-NEXT: movdqa %xmm1, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm7 +; X64-NEXT: psraw $2, %xmm6 +; X64-NEXT: pand %xmm1, %xmm6 +; X64-NEXT: por %xmm7, %xmm6 +; X64-NEXT: paddw %xmm4, %xmm4 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtw %xmm4, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: pandn %xmm6, %xmm4 +; X64-NEXT: psraw $1, %xmm6 +; X64-NEXT: pand %xmm1, %xmm6 +; X64-NEXT: por %xmm4, %xmm6 +; X64-NEXT: psrlw $8, %xmm6 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtw %xmm5, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm7 +; X64-NEXT: pandn %xmm1, %xmm7 +; X64-NEXT: psraw $4, %xmm1 +; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: por %xmm7, %xmm1 +; X64-NEXT: paddw %xmm5, %xmm5 +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtw %xmm5, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm7 +; X64-NEXT: pandn %xmm1, %xmm7 +; X64-NEXT: psraw $2, %xmm1 +; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: por %xmm7, %xmm1 +; X64-NEXT: paddw %xmm5, %xmm5 +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtw %xmm5, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm1, %xmm5 +; X64-NEXT: psraw $1, %xmm1 +; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: por %xmm5, %xmm1 +; X64-NEXT: psrlw $8, %xmm1 +; X64-NEXT: packuswb %xmm6, %xmm1 +; X64-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pcmpgtb %xmm0, %xmm3 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: por %xmm3, %xmm0 +; X64-NEXT: pandn %xmm0, %xmm1 +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v16i8: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrb $1, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $1, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %eax -; X64-AVX2-NEXT: shlb %cl, %al -; X64-AVX2-NEXT: movzbl %al, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %dil -; X64-AVX2-NEXT: xorl %eax, %eax -; X64-AVX2-NEXT: testb %dl, %dl -; X64-AVX2-NEXT: sets %al -; X64-AVX2-NEXT: addl $127, %eax -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovel %esi, %eax -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: vmovd %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %dil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %dl, %dl -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovel %esi, %ecx -; X64-AVX2-NEXT: vmovd %ecx, %xmm2 -; X64-AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $2, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $2, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $3, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $3, %xmm0, %eax -; X64-AVX2-NEXT: movl 
%eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $4, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $5, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $6, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $7, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $8, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $9, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl 
%ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $10, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $11, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $12, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $13, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $14, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $15, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: sarb %cl, %sil -; X64-AVX2-NEXT: xorl %ecx, %ecx -; X64-AVX2-NEXT: testb %al, %al -; X64-AVX2-NEXT: sets %cl -; X64-AVX2-NEXT: addl $127, %ecx -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: cmovel %edx, %ecx -; X64-AVX2-NEXT: vpinsrb $15, 
%ecx, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpsllw $2, %xmm2, %xmm3 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm4 +; X64-AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X64-AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-AVX2-NEXT: vpsraw $4, %xmm3, %xmm4 +; X64-AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpsraw $2, %xmm3, %xmm4 +; X64-AVX2-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpsraw $1, %xmm3, %xmm4 +; X64-AVX2-NEXT: vpaddw %xmm5, %xmm5, %xmm5 +; X64-AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-AVX2-NEXT: vpsraw $4, %xmm4, %xmm5 +; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpsraw $2, %xmm4, %xmm5 +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm4 +; X64-AVX2-NEXT: vpsraw $1, %xmm4, %xmm5 +; X64-AVX2-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm5, %xmm4, %xmm1 +; X64-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v16i8: diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -11,57 +11,31 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X64-LABEL: vec_v2i64: ; X64: # %bb.0: -; X64-NEXT: movq %xmm0, %rax -; X64-NEXT: movq %xmm1, %rcx -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: shlq %cl, %rdx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: shrq %cl, %rsi -; X64-NEXT: cmpq %rsi, %rax -; X64-NEXT: movq $-1, %rax -; X64-NEXT: cmovneq %rax, %rdx -; X64-NEXT: movq %rdx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rdx -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm0, %rcx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: shlq %cl, %rsi -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: shrq %cl, %rdi -; X64-NEXT: cmpq %rdi, %rdx -; X64-NEXT: cmovneq %rax, %rsi -; X64-NEXT: movq %rsi, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psllq %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X64-NEXT: movdqa %xmm0, %xmm4 +; X64-NEXT: psllq %xmm3, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: 
movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; X64-NEXT: psrlq %xmm1, %xmm2 +; X64-NEXT: psrlq %xmm3, %xmm5 +; X64-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; X64-NEXT: pcmpeqd %xmm5, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: por %xmm4, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax -; X64-AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; X64-AVX2-NEXT: movq %rax, %rdx -; X64-AVX2-NEXT: shlq %cl, %rdx -; X64-AVX2-NEXT: movq %rdx, %rsi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-AVX2-NEXT: shrq %cl, %rsi -; X64-AVX2-NEXT: cmpq %rsi, %rax -; X64-AVX2-NEXT: movq $-1, %rax -; X64-AVX2-NEXT: cmovneq %rax, %rdx -; X64-AVX2-NEXT: vmovq %rdx, %xmm2 -; X64-AVX2-NEXT: vmovq %xmm0, %rdx -; X64-AVX2-NEXT: vmovq %xmm1, %rcx -; X64-AVX2-NEXT: movq %rdx, %rsi -; X64-AVX2-NEXT: shlq %cl, %rsi -; X64-AVX2-NEXT: movq %rsi, %rdi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-AVX2-NEXT: shrq %cl, %rdi -; X64-AVX2-NEXT: cmpq %rdi, %rdx -; X64-AVX2-NEXT: cmovneq %rax, %rsi -; X64-AVX2-NEXT: vmovq %rsi, %xmm0 -; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v2i64: @@ -147,102 +121,45 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-LABEL: vec_v4i32: ; X64: # %bb.0: -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-NEXT: movd %xmm2, %eax -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpl %esi, %eax -; X64-NEXT: movl $-1, %eax -; X64-NEXT: cmovnel %eax, %edx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-NEXT: movd %xmm3, %edx -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movd %xmm3, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpl %edi, %edx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpl %edi, %edx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpl %edi, %edx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} 
xmm2 = xmm1[2,3,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] +; X64-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] +; X64-NEXT: pslld $23, %xmm1 +; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: cvttps2dq %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm5 +; X64-NEXT: pmuludq %xmm1, %xmm5 +; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X64-NEXT: pmuludq %xmm7, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7] +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: psrld %xmm1, %xmm7 +; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] +; X64-NEXT: movdqa %xmm5, %xmm2 +; X64-NEXT: psrld %xmm1, %xmm2 +; X64-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; X64-NEXT: movdqa %xmm6, %xmm1 +; X64-NEXT: psrld %xmm3, %xmm1 +; X64-NEXT: psrld %xmm4, %xmm5 +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] +; X64-NEXT: pcmpeqd %xmm5, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: por %xmm6, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v4i32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrd $1, %xmm0, %eax -; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %esi -; X64-AVX2-NEXT: cmpl %esi, %eax -; X64-AVX2-NEXT: movl $-1, %eax -; X64-AVX2-NEXT: cmovnel %eax, %edx -; X64-AVX2-NEXT: vmovd %xmm0, %esi -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: shll %cl, %edi -; X64-AVX2-NEXT: movl %edi, %r8d -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %r8d -; X64-AVX2-NEXT: cmpl %r8d, %esi -; X64-AVX2-NEXT: cmovnel %eax, %edi -; X64-AVX2-NEXT: vmovd %edi, %xmm2 -; X64-AVX2-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrd $2, %xmm0, %edx -; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpl %edi, %edx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrd $3, %xmm0, %edx -; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpl %edi, %edx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrd $3, %esi, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v4i32: @@ -303,195 +220,76 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-LABEL: vec_v8i16: ; X64: # %bb.0: -; X64-NEXT: pextrw $7, %xmm0, %eax -; X64-NEXT: pextrw $7, %xmm1, %ecx -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %edx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: # 
kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovnel %eax, %edx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: pextrw $6, %xmm0, %edx -; X64-NEXT: pextrw $6, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: pextrw $5, %xmm0, %edx -; X64-NEXT: pextrw $5, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm4 -; X64-NEXT: pextrw $4, %xmm0, %edx -; X64-NEXT: pextrw $4, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64-NEXT: pextrw $3, %xmm0, %edx -; X64-NEXT: pextrw $3, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm4 -; X64-NEXT: pextrw $2, %xmm0, %edx -; X64-NEXT: pextrw $2, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-NEXT: pextrw $1, %xmm0, %edx -; X64-NEXT: pextrw $1, %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm4 -; X64-NEXT: movd %xmm0, %edx -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: movl %edx, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movzwl %si, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: cmpw %di, %dx -; X64-NEXT: cmovnel %eax, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; X64-NEXT: pslld $23, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X64-NEXT: paddd %xmm3, %xmm2 +; X64-NEXT: 
cvttps2dq %xmm2, %xmm4 +; X64-NEXT: pslld $16, %xmm4 +; X64-NEXT: psrad $16, %xmm4 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; X64-NEXT: pslld $23, %xmm2 +; X64-NEXT: paddd %xmm3, %xmm2 +; X64-NEXT: cvttps2dq %xmm2, %xmm2 +; X64-NEXT: pslld $16, %xmm2 +; X64-NEXT: psrad $16, %xmm2 +; X64-NEXT: packssdw %xmm4, %xmm2 +; X64-NEXT: pmullw %xmm0, %xmm2 +; X64-NEXT: psllw $12, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: psraw $15, %xmm3 +; X64-NEXT: movdqa %xmm2, %xmm4 +; X64-NEXT: psrlw $8, %xmm4 +; X64-NEXT: pand %xmm3, %xmm4 +; X64-NEXT: pandn %xmm2, %xmm3 +; X64-NEXT: por %xmm4, %xmm3 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psraw $15, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm3, %xmm5 +; X64-NEXT: psrlw $4, %xmm3 +; X64-NEXT: pand %xmm4, %xmm3 +; X64-NEXT: por %xmm5, %xmm3 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: psraw $15, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm3, %xmm5 +; X64-NEXT: psrlw $2, %xmm3 +; X64-NEXT: pand %xmm4, %xmm3 +; X64-NEXT: por %xmm5, %xmm3 +; X64-NEXT: paddw %xmm1, %xmm1 +; X64-NEXT: psraw $15, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm4 +; X64-NEXT: pandn %xmm3, %xmm4 +; X64-NEXT: psrlw $1, %xmm3 +; X64-NEXT: pand %xmm1, %xmm3 +; X64-NEXT: por %xmm4, %xmm3 +; X64-NEXT: pcmpeqw %xmm3, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v8i16: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrw $1, %xmm0, %eax -; X64-AVX2-NEXT: vpextrw $1, %xmm1, %ecx -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shll %cl, %edx -; X64-AVX2-NEXT: movzwl %dx, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %esi -; X64-AVX2-NEXT: cmpw %si, %ax -; X64-AVX2-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-AVX2-NEXT: cmovnel %eax, %edx -; X64-AVX2-NEXT: vmovd %xmm0, %esi -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: shll %cl, %edi -; X64-AVX2-NEXT: movzwl %di, %edi -; X64-AVX2-NEXT: movl %edi, %r8d -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %r8d -; X64-AVX2-NEXT: cmpw %r8w, %si -; X64-AVX2-NEXT: cmovnel %eax, %edi -; X64-AVX2-NEXT: vmovd %edi, %xmm2 -; X64-AVX2-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $2, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $2, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $2, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $3, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $3, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $3, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $4, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $4, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def 
$cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $4, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $5, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $5, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $5, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $6, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $6, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $6, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrw $7, %xmm0, %edx -; X64-AVX2-NEXT: vpextrw $7, %xmm1, %ecx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shll %cl, %esi -; X64-AVX2-NEXT: movzwl %si, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrl %cl, %edi -; X64-AVX2-NEXT: cmpw %di, %dx -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrw $7, %esi, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpsllvd %ymm1, %ymm2, %ymm2 +; X64-AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; X64-AVX2-NEXT: vpsrlvd %ymm1, %ymm3, %ymm1 +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X64-AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v8i16: @@ -610,366 +408,82 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-LABEL: vec_v16i8: ; X64: # %bb.0: -; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %edx -; X64-NEXT: shlb %cl, %dl -; X64-NEXT: movzbl %dl, %esi -; X64-NEXT: movl %esi, %edx -; X64-NEXT: shrb %cl, %dl -; X64-NEXT: cmpb %dl, %al -; X64-NEXT: movl $255, %edx -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm0 -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shlb %cl, %sil -; X64-NEXT: movzbl %sil, %esi -; X64-NEXT: movl %esi, %edi -; X64-NEXT: shrb %cl, %dil -; X64-NEXT: cmpb %dil, %al -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; X64-NEXT: movl %edi, %r8d -; X64-NEXT: shlb %cl, %r8b -; X64-NEXT: movzbl %r8b, %r8d -; X64-NEXT: movl %r8d, %r9d -; X64-NEXT: shrb %cl, %r9b -; X64-NEXT: cmpb %r9b, %dil -; X64-NEXT: cmovnel %edx, %r8d -; X64-NEXT: movd %r8d, %xmm4 -; X64-NEXT: movl %esi, %edi -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shlb %cl, %dil -; X64-NEXT: movzbl %dil, %edi -; X64-NEXT: movl %edi, %r8d -; X64-NEXT: shrb %cl, %r8b -; X64-NEXT: cmpb %r8b, %sil -; X64-NEXT: cmovnel %edx, %edi -; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psllw $5, %xmm1 +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtb %xmm1, %xmm4 +; X64-NEXT: movdqa %xmm4, %xmm5 +; X64-NEXT: pandn %xmm0, %xmm5 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psllw $4, %xmm2 +; X64-NEXT: pand %xmm4, %xmm2 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: por %xmm5, %xmm2 +; X64-NEXT: paddb %xmm1, %xmm1 +; X64-NEXT: pxor %xmm5, %xmm5 +; X64-NEXT: pcmpgtb %xmm1, %xmm5 +; X64-NEXT: movdqa %xmm5, %xmm6 +; X64-NEXT: pandn %xmm2, %xmm6 +; X64-NEXT: psllw $2, %xmm2 +; X64-NEXT: pand %xmm5, %xmm2 +; X64-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; X64-NEXT: por %xmm6, %xmm2 +; X64-NEXT: paddb %xmm1, %xmm1 +; X64-NEXT: pcmpgtb %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm1 +; X64-NEXT: pandn %xmm2, %xmm1 +; X64-NEXT: paddb %xmm2, %xmm2 +; X64-NEXT: pand %xmm3, %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm1 +; X64-NEXT: psrlw $4, %xmm1 +; X64-NEXT: pand %xmm4, %xmm1 +; X64-NEXT: pandn %xmm2, %xmm4 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm4, %xmm1 +; X64-NEXT: movdqa %xmm5, %xmm4 +; X64-NEXT: pandn %xmm1, %xmm4 +; X64-NEXT: psrlw $2, %xmm1 +; X64-NEXT: pand %xmm5, %xmm1 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm4, %xmm1 +; X64-NEXT: movdqa %xmm3, %xmm4 +; X64-NEXT: pandn %xmm1, %xmm4 +; X64-NEXT: psrlw $1, %xmm1 +; X64-NEXT: pand %xmm3, %xmm1 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm4, %xmm1 +; X64-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq ; ; X64-AVX2-LABEL: vec_v16i8: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpextrb $1, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $1, %xmm0, %eax -; X64-AVX2-NEXT: movl %eax, %edx -; X64-AVX2-NEXT: shlb %cl, %dl -; X64-AVX2-NEXT: movzbl %dl, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %sil -; X64-AVX2-NEXT: cmpb %sil, %al -; X64-AVX2-NEXT: movl $255, %eax -; X64-AVX2-NEXT: cmovnel %eax, %edx -; X64-AVX2-NEXT: vmovd %xmm1, %ecx -; X64-AVX2-NEXT: vmovd %xmm0, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: shlb %cl, %dil -; X64-AVX2-NEXT: movzbl %dil, %edi -; X64-AVX2-NEXT: movl %edi, %r8d -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %r8b -; X64-AVX2-NEXT: cmpb %r8b, %sil -; X64-AVX2-NEXT: cmovnel %eax, %edi -; X64-AVX2-NEXT: vmovd %edi, %xmm2 -; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $2, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $2, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $3, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $3, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $3, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $4, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $4, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $5, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $5, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed 
$ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $5, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $6, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $6, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $6, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $7, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $7, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $7, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $8, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $8, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $8, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $9, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $9, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $9, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $10, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $10, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $10, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $11, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $11, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $12, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $12, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $12, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $13, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $13, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, 
%esi -; X64-AVX2-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $14, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $14, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $14, %esi, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpextrb $15, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrb $15, %xmm0, %edx -; X64-AVX2-NEXT: movl %edx, %esi -; X64-AVX2-NEXT: shlb %cl, %sil -; X64-AVX2-NEXT: movzbl %sil, %esi -; X64-AVX2-NEXT: movl %esi, %edi -; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-AVX2-NEXT: shrb %cl, %dil -; X64-AVX2-NEXT: cmpb %dil, %dl -; X64-AVX2-NEXT: cmovnel %eax, %esi -; X64-AVX2-NEXT: vpinsrb $15, %esi, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpsllw $5, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpsllw $2, %xmm2, %xmm3 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm4 +; X64-AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X64-AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 +; X64-AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpsrlw $2, %xmm1, %xmm3 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsrlw $1, %xmm1, %xmm3 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; X64-AVX2-NEXT: vpblendvb %xmm5, %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X86-LABEL: vec_v16i8:
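Note on the new CHECK lines above: in every element width, the vectorized sequences for both ushl_sat and sshl_sat follow the same shape, shift the lanes left, shift the result back, compare against the original lanes, and blend in a saturation value (all-ones for the unsigned tests, 127/32767 or the signed minimum chosen by the operand's sign for the signed tests) wherever the comparison fails. As a reference point only, and not part of the patch, here is a minimal scalar C++ sketch of that per-lane semantics; the helper names ushl_sat_u8 and sshl_sat_i8 are hypothetical.

    // Scalar reference semantics for the saturating left shifts these vector
    // tests exercise: shift, shift back, compare, select the saturation value
    // on overflow. Illustrative sketch; names are hypothetical.
    #include <cstdint>
    #include <limits>

    static uint8_t ushl_sat_u8(uint8_t x, unsigned amt) {
      uint8_t shifted = static_cast<uint8_t>(x << amt);
      // If shifting back does not recover x, bits were lost: saturate to max.
      if (static_cast<uint8_t>(shifted >> amt) == x)
        return shifted;
      return std::numeric_limits<uint8_t>::max();
    }

    static int8_t sshl_sat_i8(int8_t x, unsigned amt) {
      int8_t shifted = static_cast<int8_t>(static_cast<uint8_t>(x) << amt);
      // An arithmetic shift back must recover x; otherwise saturate toward the
      // sign of x (minimum for negative inputs, maximum for non-negative).
      if (static_cast<int8_t>(shifted >> amt) == x)
        return shifted;
      return x < 0 ? std::numeric_limits<int8_t>::min()
                   : std::numeric_limits<int8_t>::max();
    }

This is why the unsigned vec_* tests above end in a compare followed by a blend/por with an all-ones vector, while the signed tests additionally materialize the 127 or 32767 constant vectors and pick between the two saturation bounds with a sign compare before the final blend.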