Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -378,28 +378,28 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">, + def int_x86_sse2_padds_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">, + def int_x86_sse2_padds_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">, + def int_x86_sse2_paddus_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">, + def int_x86_sse2_paddus_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">, + def int_x86_sse2_psubs_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">, + def int_x86_sse2_psubs_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">, + def int_x86_sse2_psubus_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">, + def int_x86_sse2_psubus_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">, @@ -1627,28 +1627,28 @@ // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
- def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">, + def int_x86_avx2_padds_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">, + def int_x86_avx2_padds_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">, + def int_x86_avx2_paddus_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">, + def int_x86_avx2_paddus_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; - def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">, + def int_x86_avx2_psubs_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">, + def int_x86_avx2_psubs_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">, + def int_x86_avx2_psubus_b : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; - def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">, + def int_x86_avx2_psubus_w : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">, @@ -4701,7 +4701,7 @@ def int_x86_avx512_mask_padds_b_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">, + def int_x86_avx512_mask_padds_b_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_padds_w_128 : // FIXME: remove this intrinsic @@ -4710,7 +4710,7 @@ def int_x86_avx512_mask_padds_w_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">, + def int_x86_avx512_mask_padds_w_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_paddus_b_128 : // FIXME: remove this intrinsic @@ -4719,7 +4719,7 @@ def int_x86_avx512_mask_paddus_b_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">, + def int_x86_avx512_mask_paddus_b_512 : // FIXME: remove this intrinsic Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_paddus_w_128 : // FIXME: remove this intrinsic @@ -4728,7 +4728,7 @@ def int_x86_avx512_mask_paddus_w_256 : // FIXME: remove this intrinsic Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; - def 
int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
+  def int_x86_avx512_mask_paddus_w_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubs_b_128 : // FIXME: remove this intrinsic
@@ -4737,7 +4737,7 @@
   def int_x86_avx512_mask_psubs_b_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
                      llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
+  def int_x86_avx512_mask_psubs_b_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
                      llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubs_w_128 : // FIXME: remove this intrinsic
@@ -4746,7 +4746,7 @@
   def int_x86_avx512_mask_psubs_w_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
                      llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
+  def int_x86_avx512_mask_psubs_w_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubus_b_128 : // FIXME: remove this intrinsic
@@ -4755,7 +4755,7 @@
   def int_x86_avx512_mask_psubus_b_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
                      llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
+  def int_x86_avx512_mask_psubus_b_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
                      llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_psubus_w_128 : // FIXME: remove this intrinsic
@@ -4764,7 +4764,7 @@
   def int_x86_avx512_mask_psubus_w_256 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
                      llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
+  def int_x86_avx512_mask_psubus_w_512 : // FIXME: remove this intrinsic
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
                      llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -32234,6 +32234,53 @@
     }
   }
 
+  // Match VSELECTs into add with unsigned saturation.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) ||
+       (Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
+                                   VT == MVT::v16i32 || VT == MVT::v8i64)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    SDValue CondLHS = Cond->getOperand(0);
+    SDValue CondRHS = Cond->getOperand(1);
+
+    // Canonicalize ADD to CondRHS to simplify the logic below.
+    if (CondLHS.getOpcode() == ISD::ADD) {
+      std::swap(CondLHS, CondRHS);
+      CC = ISD::getSetCCInverse(CC, true);
+    }
+
+    // Check if one of the arms of the VSELECT is a vector with all bits set.
+    // If it's on the left side invert the predicate to simplify logic below.
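+    // Concretely, the canonical unsigned saturated add used by the new
+    // tests (and emitted by clang in place of the removed intrinsics) is
+    //   %sum = add <8 x i16> %x, %y
+    //   %ovf = icmp ugt <8 x i16> %x, %sum
+    //   %res = select <8 x i1> %ovf, <8 x i16> <all ones>, <8 x i16> %sum
+    // i.e. the all-ones arm is selected exactly when the add wrapped.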
+    SDValue Other;
+    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    // We can test against either of the addition operands.
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        (DAG.isEqualTo(Other->getOperand(0), CondLHS) ||
+         DAG.isEqualTo(Other->getOperand(1), CondLHS))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+
+      auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                             ArrayRef<SDValue> Ops) {
+        return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops);
+      };
+
+      // x < x+y ? x+y : ~0 --> addus x, y
+      // x <= x+y ? x+y : ~0 --> addus x, y
+      if ((CC == ISD::SETULE || CC == ISD::SETULT) &&
+          Other->getOpcode() == ISD::ADD && DAG.isEqualTo(Other, CondRHS))
+        return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
+                                ADDUSBuilder);
+    }
+  }
+
   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
     return V;
 
@@ -36076,6 +36123,116 @@
   return DAG.getNode(Opc, DL, VT, LHS, RHS);
 }
 
+/// This function detects the addition or subtraction with saturation pattern
+/// between 2 i8/i16 vectors and replaces this operation with the
+/// efficient X86ISD::ADDUS/X86ISD::ADDS/X86ISD::SUBUS/X86ISD::SUBS instruction.
+static SDValue detectAddSubSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget,
+                                      const SDLoc &DL) {
+  if (!VT.isVector())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if ((ScalarVT != MVT::i8 && ScalarVT != MVT::i16) ||
+      InVT.getSizeInBits() % 128 != 0 || !isPowerOf2_32(NumElems))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in AddSubSat pattern
+  // and it should be greater than the original input type.
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  // Detect the following pattern:
+  //
+  // %2 = zext <16 x i8> %0 to <16 x i16>
+  // %3 = zext <16 x i8> %1 to <16 x i16>
+  // %4 = add nuw nsw <16 x i16> %3, %2
+  // %5 = icmp ult <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
+  // %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> (vector of max InScalarVT values)
+  // %7 = trunc <16 x i16> %6 to <16 x i8>
+
+  // Detect a Sat Pattern
+  bool Signed = true;
+  SDValue Sat = detectSSatPattern(In, VT, false);
+  if (!Sat) {
+    Sat = detectUSatPattern(In, VT);
+    Signed = false;
+  }
+  if (!Sat)
+    return SDValue();
+  if (Sat.getOpcode() != ISD::ADD && Sat.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  unsigned Opcode = Sat.getOpcode() == ISD::ADD ? Signed ? X86ISD::ADDS
+                                                         : X86ISD::ADDUS
+                                                : Signed ? X86ISD::SUBS
+                                                         : X86ISD::SUBUS;
+
+  // Get addition elements.
+  SDValue LHS = Sat.getOperand(0);
+  SDValue RHS = Sat.getOperand(1);
+
+  // Check if Op is a result of type promotion.
+  auto IsExtended = [=, &DAG](SDValue Op) {
+    unsigned Opcode = Op.getOpcode();
+    unsigned EltSize = ScalarVT.getSizeInBits();
+    unsigned ExtEltSize = InScalarVT.getSizeInBits();
+    unsigned ExtPartSize = ExtEltSize - EltSize;
+
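+    // For example, when i8 elements were promoted to i16, EltSize = 8,
+    // ExtEltSize = 16 and ExtPartSize = 8: an operand qualifies if its top
+    // 8 bits are known zero (unsigned case), if it has more than 8 known
+    // sign bits (signed case), or if it is a build_vector whose constants
+    // round-trip through the narrow element type.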
+    // Extension of non-constant operand.
+    if (Opcode == ISD::ZERO_EXTEND || Opcode == ISD::SIGN_EXTEND) {
+      if (Signed)
+        return DAG.ComputeNumSignBits(Op) > ExtPartSize;
+      else {
+        APInt HighBitsMask = APInt::getHighBitsSet(ExtEltSize, ExtPartSize);
+        return DAG.MaskedValueIsZero(Op, HighBitsMask);
+      }
+    } else if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+      // Build vector of constant nodes. Each of them needs to be a correct
+      // extension from a constant of ScalarVT type.
+      unsigned NumOperands = Op.getNumOperands();
+      for (unsigned i = 0; i < NumOperands; ++i) {
+        APInt Elt = cast<ConstantSDNode>(Op.getOperand(i))->getAPIntValue();
+        Elt = Elt.getHiBits(Elt.isSignedIntN(ExtEltSize) ? ExtPartSize + 1
+                                                         : ExtPartSize);
+        if ((Signed && (!Elt.isAllOnesValue() && !Elt.isNullValue())) ||
+            (!Signed && !Elt.isNullValue()))
+          return false;
+      }
+      return true;
+    }
+    return false;
+  };
+
+  // Either both operands are extended or one of them is extended
+  // and another one is a vector of constants.
+  if (!IsExtended(LHS) || !IsExtended(RHS))
+    return SDValue();
+
+  // Truncate extended nodes to result type.
+  LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
+  RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
+
+  // Make sure that operands after truncate are of the same type
+  // as those before extension.
+  if (LHS.getValueType() != VT || RHS.getValueType() != VT)
+    return SDValue();
+
+  // The pattern is detected, emit ADDS/ADDUS/SUBS/SUBUS instruction.
+  auto AddSubSatBuilder = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
+                                   ArrayRef<SDValue> Ops) {
+    EVT VT = Ops[0].getValueType();
+    return DAG.getNode(Opcode, DL, VT, Ops);
+  };
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { LHS, RHS },
+                          AddSubSatBuilder);
+}
+
 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
@@ -36090,6 +36247,10 @@
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
 
+  // Try to detect addition or subtraction with saturation.
+  if (SDValue AddSubSat = detectAddSubSatPattern(Src, VT, DAG, Subtarget, DL))
+    return AddSubSat;
+
   // Try to combine truncation with signed/unsigned saturation.
if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; Index: test/CodeGen/X86/avx2-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx2-intrinsics-canonical.ll @@ -0,0 +1,357 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX +; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_padds_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_padds_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x 
i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a0, %a1 + %2 = icmp ugt <32 x i8> %a0, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + ret <32 x i8> %3 +} + +define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_paddus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a0, %a1 + %2 = icmp ugt <16 x i16> %a0, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubs_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a0 to <32 x i16> + %2 = sext <32 x i8> %a1 to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x 
i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubs_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a0 to <16 x i32> + %2 = sext <16 x i16> %a1 to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_b: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_b: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a0, %a1 + %sel = select <32 x i1> %cmp, <32 x i8> %a0, <32 x i8> %a1 + %sub = sub <32 x i8> %sel, %a1 + ret <32 x i8> %sub +} + +define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) { +; X86-AVX-LABEL: test_x86_avx2_psubus_w: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_w: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a0, %a1 + %sel = select <16 x i1> %cmp, <16 x i16> %a0, <16 x i16> %a1 + %sub = sub <16 x i16> %sel, %a1 + ret <16 x i16> %sub +} + +define <32 x i16> @test_x86_avx2_padds_w_512(<32 x i16> %a, <32 x i16> %b) { +; 
X86-AVX-LABEL: test_x86_avx2_padds_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2] +; X86-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_padds_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_padds_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc2] +; X64-AVX-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xed,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_padds_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_x86_avx2_psubs_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_psubs_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2] +; X86-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubs_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubsw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc2] +; X64-AVX-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xe9,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_x86_avx2_paddus_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_paddus_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2] +; X86-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_paddus_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc2] +; X64-AVX-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xdd,0xcb] +; 
X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_x86_avx2_psubus_w_512(<32 x i16> %a, <32 x i16> %b) { +; X86-AVX-LABEL: test_x86_avx2_psubus_w_512: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2] +; X86-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w_512: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psubus_w_512: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc2] +; X64-AVX-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0xd9,0xcb] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w_512: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1] +; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -98,11 +98,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = sext <32 x i8> %arg0 to <32 x i16> + %2 = sext <32 x i8> %arg1 to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %bc = bitcast <32 x i8> %8 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epi16: @@ -111,11 +118,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = sext <16 x i16> %arg0 to <16 x i32> + %2 = sext <16 x i16> %arg1 to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %bc = bitcast <16 x i16> %8 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone + define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epu8: @@ 
-124,11 +138,13 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = add <32 x i8> %arg0, %arg1 + %2 = icmp ugt <32 x i8> %arg0, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %bc = bitcast <32 x i8> %3 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_adds_epu16: @@ -137,11 +153,12 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = add <16 x i16> %arg0, %arg1 + %2 = icmp ugt <16 x i16> %arg0, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %bc = bitcast <16 x i16> %3 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_alignr_epi8: @@ -2529,11 +2546,18 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %1 = sext <32 x i8> %arg0 to <32 x i16> + %2 = sext <32 x i8> %arg1 to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %bc = bitcast <32 x i8> %8 to <4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epi16: @@ -2542,37 +2566,49 @@ ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %1 = sext <16 x i16> %arg0 to <16 x i32> + %2 = sext <16 x i16> %arg1 to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %bc = bitcast <16 x i16> %8 to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone + define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epu8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1) - %bc = bitcast <32 x i8> %res to <4 x i64> + %cmp = icmp ugt <32 x i8> %arg0, %arg1 + %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1 + %sub = sub <32 x i8> %sel, %arg1 + %bc = bitcast <32 x i8> %sub to 
<4 x i64> ret <4 x i64> %bc } -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone + define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_subs_epu16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1) - %bc = bitcast <16 x i16> %res to <4 x i64> + %cmp = icmp ugt <16 x i16> %arg0, %arg1 + %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1 + %sub = sub <16 x i16> %sel, %arg1 + %bc = bitcast <16 x i16> %sub to <4 x i64> ret <4 x i64> %bc } -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: test_mm256_unpackhi_epi8: Index: test/CodeGen/X86/avx512bw-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx512bw-intrinsics-canonical.ll @@ -0,0 +1,675 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; 
AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = add nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x 
i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = 
sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + ret <32 x i16> %8 +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %passThru + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = sext <32 x i16> %a to <32 x i32> + %2 = sext <32 x i16> %b to <32 x i32> + %3 = sub nsw <32 x i32> %1, %2 + %4 = icmp slt <32 x i32> %3, + %5 = select <32 x i1> %4, <32 x i32> %3, <32 x i32> + %6 = icmp sgt <32 x i32> %5, + %7 = select <32 x i1> %6, <32 x i32> %5, <32 x i32> + %8 = trunc <32 x i32> %7 to <32 x i16> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> zeroinitializer + ret <32 x i16> %10 +} + +define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %1 = add 
<32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + ret <32 x i16> %3 +} + +define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %passThru + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %1 = add <32 x i16> %a, %b + %2 = icmp ugt <32 x i16> %a, %1 + %3 = select <32 x i1> %2, <32 x i16> , <32 x i16> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer + ret <32 x i16> %5 +} + +define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rr_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, 
%zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k1 +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; AVX512BW-LABEL: test_mask_subs_epu16_rm_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + ret <32 x i16> %sub +} + +define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru + ret <32 x i16> %res +} + +define <32 x i16> 
@test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} +; AVX512F-32-NEXT: retl + %b = load <32 x i16>, <32 x i16>* %ptr_b + %cmp = icmp ugt <32 x i16> %a, %b + %sel = select <32 x i1> %cmp, <32 x i16> %a, <32 x i16> %b + %sub = sub <32 x i16> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer + ret <32 x i16> %res +} + +define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpaddsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpaddsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = add nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubsw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: pushl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-32-NEXT: .cfi_offset %ebp, -8 +; AVX512F-32-NEXT: movl %esp, %ebp +; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512F-32-NEXT: andl $-64, %esp +; AVX512F-32-NEXT: subl $64, %esp +; AVX512F-32-NEXT: vpsubsw %zmm2, %zmm0, %zmm0 +; AVX512F-32-NEXT: vpsubsw 8(%ebp), %zmm1, %zmm1 +; AVX512F-32-NEXT: movl %ebp, %esp +; AVX512F-32-NEXT: popl %ebp +; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512F-32-NEXT: retl + %1 = sext <64 x i16> %a to <64 x i32> + %2 = sext <64 x i16> %b to <64 x i32> + %3 = sub nsw <64 x i32> %1, %2 + %4 = icmp slt <64 x i32> %3, + %5 = select <64 x i1> %4, <64 x i32> %3, <64 x i32> + %6 = icmp sgt <64 x i32> %5, + %7 = select <64 x i1> %6, <64 x i32> %5, <64 x i32> + %8 = trunc <64 x i32> %7 to <64 x i16> + ret <64 x i16> %8 +} + +define <64 x i16> @test_mask_adds_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) { +; AVX512BW-LABEL: test_mask_adds_epu16_rr_1024: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddusw %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_mask_adds_epu16_rr_1024: +; 
AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-64, %esp
+; AVX512F-32-NEXT: subl $64, %esp
+; AVX512F-32-NEXT: vpaddusw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddusw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: retl
+ %1 = add <64 x i16> %a, %b
+ %2 = icmp ugt <64 x i16> %a, %1
+ %3 = select <64 x i1> %2, <64 x i16> , <64 x i16> %1
+ ret <64 x i16> %3
+}
+
+define <64 x i16> @test_mask_subs_epu16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epu16_rr_1024:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpsubusw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubusw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epu16_rr_1024:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-64, %esp
+; AVX512F-32-NEXT: subl $64, %esp
+; AVX512F-32-NEXT: vpsubusw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpsubusw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: retl
+ %cmp = icmp ugt <64 x i16> %a, %b
+ %sel = select <64 x i1> %cmp, <64 x i16> %a, <64 x i16> %b
+ %sub = sub <64 x i16> %sel, %b
+ ret <64 x i16> %sub
+}
Index: test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
@@ -0,0 +1,1572 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+ ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+ ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i16>, <8 x i16>* %ptr_b
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ ret <8 x i16> %8
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i16>, <8 x i16>* %ptr_b
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru
+ ret <8 x i16> %10
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i16>, <8 x i16>* %ptr_b
+ %1 = sext <8 x i16> %a to <8 x i32>
+ %2 = sext <8 x i16> %b to <8 x i32>
+ %3 = add nsw <8 x i32> %1, %2
+ %4 = icmp slt <8 x i32> %3,
+ %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32>
+ %6 = icmp sgt <8 x i32> %5,
+ %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32>
+ %8 = trunc <8 x i32> %7 to <8 x i16>
+ %9 = bitcast i8 %mask to <8 x i1>
+ %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer
+ ret <8 x i16> %10
+}
+
+define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_256:
+; CHECK: ##
%bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw 
<16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = add nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define 
<8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> %passThru + ret <8 x i16> %10 +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = sext <8 x i16> %a to <8 x i32> + %2 = sext <8 x i16> %b to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %9 = bitcast i8 %mask to <8 x i1> + %10 = select <8 x i1> %9, <8 x i16> %8, <8 x i16> zeroinitializer + ret <8 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: 
[0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + ret <16 x i16> %8 +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> %passThru + ret <16 x i16> %10 +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = sext <16 x i16> %a to <16 x i32> + %2 = sext <16 x i16> %b to <16 x i32> + %3 = sub nsw <16 x i32> %1, %2 + %4 = icmp slt <16 x i32> %3, + %5 = select <16 x i1> %4, <16 x i32> %3, <16 x 
i32> + %6 = icmp sgt <16 x i32> %5, + %7 = select <16 x i1> %6, <16 x i32> %5, <16 x i32> + %8 = trunc <16 x i32> %7 to <16 x i16> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i16> %8, <16 x i16> zeroinitializer + ret <16 x i16> %10 +} + +define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %passThru + ret <8 x i16> %5 +} + +define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07] +; 
CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = add <8 x i16> %a, %b + %2 = icmp ugt <8 x i16> %a, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %4 = bitcast i8 %mask to <8 x i1> + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer + ret <8 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %passThru + ret <16 x i16> %5 +} + +define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: 
[0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = add <16 x i16> %a, %b + %2 = icmp ugt <16 x i16> %a, %1 + %3 = select <16 x i1> %2, <16 x i16> , <16 x i16> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer + ret <16 x i16> %5 +} + +define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> 
%a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %cmp = icmp ugt <8 x i16> %a, %b + %sel = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %sub = sub <8 x i16> %sel, %b + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epu16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub 
= sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %cmp = icmp ugt <16 x i16> %a, %b + %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %sub = sub <16 x i16> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a 
to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x 
i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = add nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: 
test_mask_subs_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x 
i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> %passThru + ret <16 x i8> %10 +} + +define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = sext <16 x i8> %a to <16 x i16> + %2 = sext <16 x i8> %b to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %9 = bitcast i16 %mask to <16 x i1> + %10 = select <16 x i1> %9, <16 x i8> %8, <16 x i8> zeroinitializer + ret <16 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> 
%10 +} + +define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + ret <32 x i8> %8 +} + +define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> %passThru + ret <32 x i8> %10 +} + +define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = sext <32 x i8> %a to <32 x i16> + %2 = sext <32 x i8> %b to <32 x i16> + %3 = sub nsw <32 x i16> %1, %2 + %4 = icmp slt <32 x i16> %3, + %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> + %6 = icmp sgt <32 x i16> %5, + %7 = select <32 x i1> %6, <32 x i16> %5, <32 x i16> + %8 = trunc <32 x i16> %7 to <32 x i8> + %9 = bitcast i32 %mask to <32 x i1> + %10 = select <32 x i1> %9, <32 x i8> %8, <32 x i8> zeroinitializer + ret <32 x i8> %10 +} + +define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = 
select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1 + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %passThru + ret <16 x i8> %5 +} + +define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = add <16 x i8> %a, %b + %2 = icmp ugt <16 x i8> %a, %1 + %3 = select <16 x i1> %2, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %1 + %4 = bitcast i16 %mask to <16 x i1> + %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> zeroinitializer + ret <16 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x i8> %1 + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <32 x 
i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %passThru + ret <32 x i8> %5 +} + +define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = add <32 x i8> %a, %b + %2 = icmp ugt <32 x i8> %a, %1 + %3 = select <32 x i1> %2, <32 x i8> , <32 x i8> %1 + %4 = bitcast i32 %mask to <32 x i1> + %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> zeroinitializer + ret <32 x i8> %5 +} + +define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + ret <16 x i8> %sub +} + +define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + 
%sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + ret <16 x i8> %sub +} + +define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %cmp = icmp ugt <16 x i8> %a, %b + %sel = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %sub = sub <16 x i8> %sel, %b + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epu8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + ret <32 x i8> %sub +} + +define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1] +; 
CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epu8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + ret <32 x i8> %sub +} + +define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epu8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %cmp = icmp ugt <32 x i8> %a, %b + %sel = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %sub = sub <32 x i8> %sel, %b + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer + ret <32 x i8> %res +} Index: test/CodeGen/X86/sse2-intrinsics-canonical.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -0,0 +1,666 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2 +; RUN: llc < %s -mtriple=i386-apple-darwin 
-mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX + +; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c + +define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_padds_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <16 x i8> %a0 to <16 x i16> + %2 = sext <16 x i8> %a1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <16 x i8> %a0, %a1 + %2 = icmp ugt <16 x i8> %a0, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + ret <16 x i8> %3 +} + +define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <8 x i16> %a0, %a1 + %2 = icmp ugt <8 x i16> %a0, %1 + %3 = select <8 x i1> 
%2, <8 x i16> , <8 x i16> %1 + ret <8 x i16> %3 +} + +define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <16 x i8> %a0 to <16 x i16> + %2 = sext <16 x i8> %a1 to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + ret <16 x i8> %8 +} + +define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i16> %a0 to <8 x i32> + %2 = sext <8 x i16> %a1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <16 x i8> %a0, %a1 + %sel = select <16 x i1> %cmp, <16 x i8> %a0, <16 x i8> %a1 + %sub = sub <16 x i8> %sel, %a1 + ret <16 x i8> %sub +} + +define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w: +; SKX: ## %bb.0: +; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <8 x i16> %a0, %a1 + %sel = select <8 x i1> %cmp, <8 x i16> %a0, <8 x i16> %a1 + %sub = sub <8 x i16> %sel, %a1 + ret <8 x i16> %sub +} + +define <8 x i8> @test_x86_sse2_padds_b_64(<8 x i8> %a0, <8 x i8> 
%a1) { +; SSE-LABEL: test_x86_sse2_padds_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08] +; SSE-NEXT: psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08] +; SSE-NEXT: psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08] +; SSE-NEXT: psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08] +; SSE-NEXT: paddw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfd,0xc1] +; SSE-NEXT: pminsw LCPI8_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; SSE-NEXT: pmaxsw LCPI8_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] +; AVX2-NEXT: vpminsw LCPI8_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsw LCPI8_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; SKX-NEXT: vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; SKX-NEXT: vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; SKX-NEXT: vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; SKX-NEXT: vpminsw LCPI8_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsw LCPI8_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI8_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i8> %a0 to <8 x i16> + %2 = sext <8 x i8> %a1 to <8 x i16> + %3 = add nsw <8 x i16> %1, %2 + %4 = icmp slt <8 x i16> %3, + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> + %6 = icmp sgt <8 x i16> %5, + %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> + %8 = trunc <8 x i16> %7 to <8 x i8> + ret <8 x i8> %8 +} + +define <4 x i16> @test_x86_sse2_padds_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10] +; SSE-NEXT: psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10] +; SSE-NEXT: pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10] +; SSE-NEXT: psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10] +; SSE-NEXT: paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [32767,32767,32767,32767] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI9_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: 
[0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd1] +; SSE-NEXT: pand %xmm2, %xmm1 ## encoding: [0x66,0x0f,0xdb,0xca] +; SSE-NEXT: pandn %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd0] +; SSE-NEXT: por %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI9_1, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1] +; SSE-NEXT: pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; AVX2-NEXT: vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI9_0, kind: FK_Data_4 +; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI9_1, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; SKX-NEXT: vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; SKX-NEXT: vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; SKX-NEXT: vpminsd LCPI9_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI9_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsd LCPI9_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI9_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <4 x i16> %a0 to <4 x i32> + %2 = sext <4 x i16> %a1 to <4 x i32> + %3 = add nsw <4 x i32> %1, %2 + %4 = icmp slt <4 x i32> %3, + %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> + %6 = icmp sgt <4 x i32> %5, + %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> + %8 = trunc <4 x i32> %7 to <4 x i16> + ret <4 x i16> %8 +} + +define <8 x i8> @test_x86_sse2_paddus_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: 
LCPI10_0, kind: FK_Data_4 +; SSE-NEXT: paddw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfd,0xc8] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] +; SSE-NEXT: pcmpgtw %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x65,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: pand LCPI10_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0xdb,0xda] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0xdb,0xca] +; AVX2-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x65,0xc9] +; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc2,0x10] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vmovdqa LCPI10_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI10_0, kind: FK_Data_4 +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xda] +; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xca] +; SKX-NEXT: vpcmpnleuw %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x3e,0xc9,0x06] +; SKX-NEXT: vmovdqu16 LCPI10_0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI10_0, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <8 x i8> %a0, %a1 + %2 = icmp ugt <8 x i8> %a0, %1 + %3 = select <8 x i1> %2, <8 x i8> , <8 x i8> %1 + ret <8 x i8> %3 +} + +define <4 x i16> @test_x86_sse2_paddus_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_paddus_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4 +; SSE-NEXT: paddd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0xfe,0xc8] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pand %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd1] +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: pand LCPI11_0, %xmm0 ## encoding: [0x66,0x0f,0xdb,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4 +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_paddus_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: 
[0xc5,0xe9,0xef,0xd2] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] +; AVX2-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfe,0xc1] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] +; AVX2-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 ## encoding: [0xc5,0xe1,0x66,0xc9] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x18,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI11_0, kind: FK_Data_4 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc2,0x10] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_paddus_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm3 ## encoding: [0xc4,0xe3,0x79,0x0e,0xda,0xaa] +; SKX-NEXT: ## xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0e,0xca,0xaa] +; SKX-NEXT: ## xmm1 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpcmpnleud %xmm1, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1e,0xc9,0x06] +; SKX-NEXT: vpbroadcastd LCPI11_0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI11_0, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = add <4 x i16> %a0, %a1 + %2 = icmp ugt <4 x i16> %a0, %1 + %3 = select <4 x i1> %2, <4 x i16> , <4 x i16> %1 + ret <4 x i16> %3 +} + +define <8 x i8> @test_x86_sse2_psubs_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: psllw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x08] +; SSE-NEXT: psraw $8, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x08] +; SSE-NEXT: psllw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xf1,0x08] +; SSE-NEXT: psraw $8, %xmm1 ## encoding: [0x66,0x0f,0x71,0xe1,0x08] +; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1] +; SSE-NEXT: pminsw LCPI12_0, %xmm0 ## encoding: [0x66,0x0f,0xea,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; SSE-NEXT: pmaxsw LCPI12_1, %xmm0 ## encoding: [0x66,0x0f,0xee,0x05,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; AVX2-NEXT: vpsllw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; AVX2-NEXT: vpsraw $8, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1] +; AVX2-NEXT: vpminsw LCPI12_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsw LCPI12_1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; AVX2-NEXT: retl ## encoding: [0xc3] 
+; +; SKX-LABEL: test_x86_sse2_psubs_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpsllw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x08] +; SKX-NEXT: vpsraw $8, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x08] +; SKX-NEXT: vpsllw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xf1,0x08] +; SKX-NEXT: vpsraw $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xe1,0x08] +; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] +; SKX-NEXT: vpminsw LCPI12_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI12_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsw LCPI12_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI12_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <8 x i8> %a0 to <8 x i16> + %2 = sext <8 x i8> %a1 to <8 x i16> + %3 = sub nsw <8 x i16> %1, %2 + %4 = icmp slt <8 x i16> %3, + %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> + %6 = icmp sgt <8 x i16> %5, + %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> + %8 = trunc <8 x i16> %7 to <8 x i8> + ret <8 x i8> %8 +} + +define <4 x i16> @test_x86_sse2_psubs_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: pslld $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x10] +; SSE-NEXT: psrad $16, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x10] +; SSE-NEXT: pslld $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xf1,0x10] +; SSE-NEXT: psrad $16, %xmm1 ## encoding: [0x66,0x0f,0x72,0xe1,0x10] +; SSE-NEXT: psubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xfa,0xc1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI13_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd1] +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd0] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pandn %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd1] +; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x0d,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI13_1, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x66,0xc1] +; SSE-NEXT: pand %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xdb,0xd0] +; SSE-NEXT: pandn %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdf,0xc1] +; SSE-NEXT: por %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xeb,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; AVX2-NEXT: vpslld $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI13_0, kind: FK_Data_4 +; AVX2-NEXT: 
vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x58,0x0d,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI13_1, kind: FK_Data_4 +; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpslld $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x10] +; SKX-NEXT: vpsrad $16, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x10] +; SKX-NEXT: vpslld $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xf1,0x10] +; SKX-NEXT: vpsrad $16, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xe1,0x10] +; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] +; SKX-NEXT: vpminsd LCPI13_0{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x39,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI13_0, kind: FK_Data_4 +; SKX-NEXT: vpmaxsd LCPI13_1{1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x3d,0x05,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 6, value: LCPI13_1, kind: FK_Data_4 +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = sext <4 x i16> %a0 to <4 x i32> + %2 = sext <4 x i16> %a1 to <4 x i32> + %3 = sub nsw <4 x i32> %1, %2 + %4 = icmp slt <4 x i32> %3, + %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> + %6 = icmp sgt <4 x i32> %5, + %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> + %8 = trunc <4 x i32> %7 to <4 x i16> + ret <4 x i16> %8 +} + +define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_b_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] +; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pmaxsw %xmm3, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc3] +; SSE-NEXT: psubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf9,0xc1] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_b_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2] +; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf9,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_b_64: +; SKX: ## %bb.0: +; SKX-NEXT: vmovdqa LCPI14_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 4, value: LCPI14_0, kind: FK_Data_4 +; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2] +; 
SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3] +; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <8 x i8> %a0, %a1 + %sel = select <8 x i1> %cmp, <8 x i8> %a0, <8 x i8> %a1 + %sub = sub <8 x i8> %sel, %a1 + ret <8 x i8> %sub +} + +define <4 x i16> @test_x86_sse2_psubus_w_64(<4 x i16> %a0, <4 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubus_w_64: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: ## encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] +; SSE-NEXT: ## fixup A - offset: 4, value: LCPI15_0, kind: FK_Data_4 +; SSE-NEXT: movdqa %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd9] +; SSE-NEXT: pand %xmm2, %xmm3 ## encoding: [0x66,0x0f,0xdb,0xda] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: movdqa %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x6f,0xd0] +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 ## encoding: [0x66,0x0f,0x66,0xd3] +; SSE-NEXT: pand %xmm2, %xmm0 ## encoding: [0x66,0x0f,0xdb,0xc2] +; SSE-NEXT: pandn %xmm3, %xmm2 ## encoding: [0x66,0x0f,0xdf,0xd3] +; SSE-NEXT: por %xmm0, %xmm2 ## encoding: [0x66,0x0f,0xeb,0xd0] +; SSE-NEXT: psubd %xmm1, %xmm2 ## encoding: [0x66,0x0f,0xfa,0xd1] +; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubus_w_64: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] +; AVX2-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; AVX2-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] +; AVX2-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX2-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc3] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfa,0xc1] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubus_w_64: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm1, %xmm3 ## encoding: [0xc4,0xe3,0x71,0x0e,0xda,0xaa] +; SKX-NEXT: ## xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SKX-NEXT: vpblendw $170, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xaa] +; SKX-NEXT: ## xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SKX-NEXT: vpmaxud %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc3] +; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1] +; SKX-NEXT: retl ## encoding: [0xc3] + %cmp = icmp ugt <4 x i16> %a0, %a1 + %sel = select <4 x i1> %cmp, <4 x i16> %a0, <4 x i16> %a1 + %sub = sub <4 x i16> %sel, %a1 + ret <4 x i16> %sub +} + +define <8 x i16> @test_x86_sse2_padds_w_fail(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w_fail: +; SSE: ## %bb.0: +; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] +; SSE-NEXT: paddsw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc0] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_padds_w_fail: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] +; AVX2-NEXT: vpaddsw %xmm0, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf9,0xed,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_padds_w_fail: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] +; SKX-NEXT: vpaddsw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = add nuw nsw <8 x i32> zeroinitializer, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} + +define <8 x i16> @test_x86_sse2_psubs_w_fail(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w_fail: +; SSE: ## %bb.0: +; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] +; SSE-NEXT: psubsw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc0] +; SSE-NEXT: retl ## encoding: [0xc3] +; +; AVX2-LABEL: test_x86_sse2_psubs_w_fail: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0] +; AVX2-NEXT: vpsubsw %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc0] +; AVX2-NEXT: retl ## encoding: [0xc3] +; +; SKX-LABEL: test_x86_sse2_psubs_w_fail: +; SKX: ## %bb.0: +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0] +; SKX-NEXT: vpsubsw %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc0] +; SKX-NEXT: retl ## encoding: [0xc3] + %1 = shufflevector <16 x i8> zeroinitializer, <16 x i8> undef, <8 x i32> + %2 = sext <8 x i8> %1 to <8 x i32> + %3 = sub nuw nsw <8 x i32> zeroinitializer, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + ret <8 x i16> %8 +} Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -112,8 +112,15 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = add nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone @@ -130,11 +137,17 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = add nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x 
i64> %bc } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_adds_epu8: @@ -148,11 +161,12 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = add <16 x i8> %arg0, %arg1 + %2 = icmp ugt <16 x i8> %arg0, %1 + %3 = select <16 x i1> %2, <16 x i8> , <16 x i8> %1 + %bc = bitcast <16 x i8> %3 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_adds_epu16: @@ -166,11 +180,12 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = add <8 x i16> %arg0, %arg1 + %2 = icmp ugt <8 x i16> %arg0, %1 + %3 = select <8 x i1> %2, <8 x i16> , <8 x i16> %1 + %bc = bitcast <8 x i16> %3 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_and_pd: @@ -3507,11 +3522,17 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %1 = sext <16 x i8> %arg0 to <16 x i16> + %2 = sext <16 x i8> %arg1 to <16 x i16> + %3 = sub nsw <16 x i16> %1, %2 + %4 = icmp slt <16 x i16> %3, + %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> + %6 = icmp sgt <16 x i16> %5, + %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> + %8 = trunc <16 x i16> %7 to <16 x i8> + %bc = bitcast <16 x i8> %8 to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epi16: @@ -3525,47 +3546,69 @@ ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %1 = sext <8 x i16> %arg0 to <8 x i32> + %2 = sext <8 x i16> %arg1 to <8 x i32> + %3 = sub nsw <8 x i32> %1, %2 + %4 = icmp slt <8 x i32> %3, + %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> + %6 = icmp sgt <8 x i32> %5, + %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> + %8 = trunc <8 x i32> %7 to <8 x i16> + %bc = bitcast <8 x i16> %8 to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epu8: ; X32: # %bb.0: -; X32-NEXT: psubusb %xmm1, %xmm0 +; X32-NEXT: pmaxub %xmm1, %xmm0 +; X32-NEXT: psubb %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_subs_epu8: ; X64: # %bb.0: -; X64-NEXT: psubusb %xmm1, %xmm0 +; X64-NEXT: pmaxub %xmm1, %xmm0 +; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x 
i8> %arg0, <16 x i8> %arg1) - %bc = bitcast <16 x i8> %res to <2 x i64> + %cmp = icmp ugt <16 x i8> %arg0, %arg1 + %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1 + %sub = sub <16 x i8> %sel, %arg1 + %bc = bitcast <16 x i8> %sub to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; X32-LABEL: test_mm_subs_epu16: ; X32: # %bb.0: -; X32-NEXT: psubusw %xmm1, %xmm0 +; X32-NEXT: movdqa .LCPI190_0, %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X32-NEXT: movdqa %xmm1, %xmm3 +; X32-NEXT: pxor %xmm2, %xmm3 +; X32-NEXT: pxor %xmm2, %xmm0 +; X32-NEXT: pmaxsw %xmm3, %xmm0 +; X32-NEXT: pxor %xmm2, %xmm0 +; X32-NEXT: psubw %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm_subs_epu16: ; X64: # %bb.0: -; X64-NEXT: psubusw %xmm1, %xmm0 +; X64-NEXT: movdqa .LCPI190_0(%rip), %xmm2 # xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: movdqa %xmm1, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm3 +; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: pmaxsw %xmm3, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm0 +; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1) - %bc = bitcast <8 x i16> %res to <2 x i64> + %cmp = icmp ugt <8 x i16> %arg0, %arg1 + %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1 + %sub = sub <8 x i16> %sel, %arg1 + %bc = bitcast <8 x i16> %sub to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; X32-LABEL: test_mm_ucomieq_sd: