Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -223,6 +223,18 @@ /// Same for multiplication. SMULO, UMULO, + /// [SU]SAT(X, N) - Saturate X to an N-bit integer. + /// That is: + /// sat(X, N) = (X < MIN_N) ? MIN_N : ((X > MAX_N) ? MAX_N : X) + /// with: + /// usat: MIN_N = 0, MAX_N = 2^N-1 + /// ssat: MIN_N = -2^(N-1), MAX_N = 2^(N-1)-1 + /// In other words: + /// usat(X, N) = min(max(X, 0), 2^N-1) + /// ssat(X, N) = min(max(X, -2^(N-1)), 2^(N-1)-1) + /// These nodes are generated from the llvm.[su]sat intrinsics. + SSAT, USAT, + /// Simple binary floating point operators. FADD, FSUB, FMUL, FMA, FDIV, FREM, Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -99,6 +99,9 @@ def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2> ]>; +def SDTIntSatOp : SDTypeProfile<1, 2, [ // ssat, usat + SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2> +]>; def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0> ]>; @@ -374,6 +377,9 @@ def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>; def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>; +def ssat : SDNode<"ISD::SSAT" , SDTIntSatOp>; +def usat : SDNode<"ISD::USAT" , SDTIntSatOp>; + def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>; Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3757,6 
+3757,32 @@ Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType)); break; } + case ISD::USAT: + case ISD::SSAT: { + SDValue Val = Node->getOperand(0); + EVT VT = Val.getValueType(); + unsigned BitWidth = VT.getScalarSizeInBits(); + uint64_t SatBit = cast(Node->getOperand(1))->getZExtValue(); + APInt Max, Min; + + if (Node->getOpcode() == ISD::USAT) { + Max = APInt::getLowBitsSet(BitWidth, SatBit); + Min = APInt::getNullValue(BitWidth); + } else { + Max = APInt::getLowBitsSet(BitWidth, SatBit - 1); + Min = APInt::getHighBitsSet(BitWidth, BitWidth - SatBit + 1); + } + + SDValue MaxV = DAG.getConstant(Max, VT); + SDValue MinV = DAG.getConstant(Min, VT); + + SDValue Res; + Res = DAG.getSelectCC(dl, Val, MinV, MinV, Val, ISD::SETLT); + Res = DAG.getSelectCC(dl, Res, MaxV, MaxV, Res, ISD::SETGT); + + Results.push_back(Res); + break; + } case ISD::UMULO: case ISD::SMULO: { EVT VT = Node->getValueType(0); Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -121,6 +121,10 @@ case ISD::SMULO: case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break; + case ISD::USAT: + case ISD::SSAT: + Res = PromoteIntRes_SAT(N); break; + case ISD::ATOMIC_LOAD: Res = PromoteIntRes_Atomic0(cast(N)); break; @@ -646,6 +650,15 @@ return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); } +SDValue DAGTypeLegalizer::PromoteIntRes_SAT(SDNode *N) { + SDValue Op = SExtPromotedInteger(N->getOperand(0)); + EVT NVT = Op.getValueType(); + SDValue SatBit = N->getOperand(1); + SDLoc dl(N); + + return DAG.getNode(N->getOpcode(), dl, NVT, Op, SatBit); +} + SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); @@ -1248,6 +1261,9 @@ case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; + case ISD::USAT: + case ISD::SSAT: 
ExpandIntRes_SAT(N, Lo, Hi); break; + case ISD::SADDO: case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break; case ISD::UADDO: @@ -2106,6 +2122,77 @@ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi); } +void DAGTypeLegalizer::ExpandIntRes_SAT(SDNode *N, SDValue &Lo, SDValue &Hi) { + EVT VT = N->getValueType(0); + SDLoc dl(N); + SDValue Val = N->getOperand(0); + unsigned BitWidth = VT.getScalarSizeInBits(); + uint64_t SatBit = cast(N->getOperand(1))->getZExtValue(); + APInt Max, Min; + + if (N->getOpcode() == ISD::USAT) { + Max = APInt::getLowBitsSet(BitWidth, SatBit); + Min = APInt::getNullValue(BitWidth); + } else { + Max = APInt::getLowBitsSet(BitWidth, SatBit - 1); + Min = APInt::getHighBitsSet(BitWidth, BitWidth - SatBit + 1); + } + + SDValue MaxV = DAG.getConstant(Max, VT); + SDValue MinV = DAG.getConstant(Min, VT); + + SDValue LoMaxV, HiMaxV; + SDValue LoMinV, HiMinV; + + SplitInteger(MaxV, LoMaxV, HiMaxV); + SetExpandedInteger(MaxV, LoMaxV, HiMaxV); + SplitInteger(MinV, LoMinV, HiMinV); + SetExpandedInteger(MinV, LoMinV, HiMinV); + + SDValue MinSetCC; + { + SDValue NewLHS = Val, NewRHS = MinV; + ISD::CondCode CCCode = ISD::SETLT; + IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, dl); + // If ExpandSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!NewRHS.getNode()) { + NewRHS = DAG.getConstant(0, NewLHS.getValueType()); + CCCode = ISD::SETNE; + } + EVT SetCCType = getSetCCResultType(NewLHS.getValueType()); + MinSetCC = DAG.getSetCC(dl, SetCCType, NewLHS, NewRHS, CCCode); + } + + SDValue MaxSetCC; + { + SDValue NewLHS = Val, NewRHS = MaxV; + ISD::CondCode CCCode = ISD::SETGT; + IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, dl); + // If ExpandSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. 
+ if (!NewRHS.getNode()) { + NewRHS = DAG.getConstant(0, NewLHS.getValueType()); + CCCode = ISD::SETNE; + } + EVT SetCCType = getSetCCResultType(NewLHS.getValueType()); + MaxSetCC = DAG.getSetCC(dl, SetCCType, NewLHS, NewRHS, CCCode); + } + assert(MinSetCC.getValueType() == MaxSetCC.getValueType() && + "Inconsistent SETCC result types when expanding SSAT/USAT."); + + SDValue LoVal, HiVal; + GetExpandedInteger(Val, LoVal, HiVal); + + EVT ResVT = LoVal.getValueType(); + + Lo = DAG.getSelect(dl, ResVT, MinSetCC, LoMinV, LoVal); + Lo = DAG.getSelect(dl, ResVT, MaxSetCC, LoMaxV, Lo); + + Hi = DAG.getSelect(dl, ResVT, MinSetCC, HiMinV, HiVal); + Hi = DAG.getSelect(dl, ResVT, MaxSetCC, HiMaxV, Hi); +} + void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -258,6 +258,7 @@ SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); + SDValue PromoteIntRes_SAT(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo); @@ -334,6 +335,8 @@ void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SAT (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -521,6 +524,7 @@ SDValue ScalarizeVecRes_UnaryOp(SDNode *N); SDValue ScalarizeVecRes_InregOp(SDNode *N); + SDValue ScalarizeVecRes_SAT(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); @@ -571,6 +575,7 @@ void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_SAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -648,6 +653,7 @@ SDValue WidenVecRes_Shift(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); + SDValue WidenVecRes_SAT(SDNode *N); // Widen Vector Operand. bool WidenVectorOperand(SDNode *N, unsigned OpNo); Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandSAT(SDValue Op); /// \brief Implements vector promotion. 
/// @@ -316,6 +317,8 @@ case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: + case ISD::SSAT: + case ISD::USAT: QueryType = Node->getValueType(0); break; case ISD::FP_ROUND_INREG: @@ -685,11 +688,60 @@ return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::SSAT: + case ISD::USAT: + return ExpandSAT(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } } +SDValue VectorLegalizer::ExpandSAT(SDValue Op) { + SDLoc DL(Op); + + SDValue Val = Op.getOperand(0); + EVT VT = Val.getValueType(); + + uint64_t SatBit = cast(Op.getOperand(1))->getZExtValue(); + + assert(VT.isVector() && VT == Val.getValueType() && "Invalid type"); + + if (TLI.getOperationAction(ISD::VSELECT, VT) == TargetLowering::Expand || + TLI.getOperationAction(ISD::SETCC, VT) == TargetLowering::Expand) + return DAG.UnrollVectorOp(Op.getNode()); + + EVT SVT = VT.getScalarType(); + unsigned BitWidth = SVT.getSizeInBits(); + APInt Max, Min; + + if (Op->getOpcode() == ISD::USAT) { + Max = APInt::getLowBitsSet(BitWidth, SatBit); + Min = APInt::getNullValue(BitWidth); + } else { + Max = APInt::getLowBitsSet(BitWidth, SatBit - 1); + Min = APInt::getHighBitsSet(BitWidth, BitWidth - SatBit + 1); + } + + SmallVector MaxSplatArr(VT.getVectorNumElements(), + DAG.getConstant(Max, SVT)); + SmallVector MinSplatArr(VT.getVectorNumElements(), + DAG.getConstant(Min, SVT)); + + SDValue MaxV = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaxSplatArr); + SDValue MinV = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MinSplatArr); + + EVT SetCCType = TLI.getSetCCResultType(*DAG.getContext(), VT); + SDValue MinSetCC = DAG.getSetCC(DL, SetCCType, Val, MinV, ISD::SETLT); + + SDValue Res; + Res = DAG.getSelect(DL, VT, MinSetCC, MinV, Val); + + SDValue MaxSetCC = DAG.getSetCC(DL, SetCCType, Res, MaxV, ISD::SETGT); + Res = DAG.getSelect(DL, VT, MaxSetCC, MaxV, Res); + + return Res; +} + SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { // Lower a select instruction 
where the condition is a scalar and the // operands are vectors. Lower this select to VSELECT and implement it Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -66,6 +66,10 @@ case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; + + case ISD::SSAT: + case ISD::USAT: R = ScalarizeVecRes_SAT(N); break; + case ISD::ANY_EXTEND: case ISD::BSWAP: case ISD::CTLZ: @@ -135,6 +139,12 @@ SetScalarizedVector(SDValue(N, ResNo), R); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_SAT(SDNode *N) { + SDValue Op = GetScalarizedVector(N->getOperand(0)); + SDValue SatBit = N->getOperand(1); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, SatBit); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(0)); SDValue RHS = GetScalarizedVector(N->getOperand(1)); @@ -670,6 +680,10 @@ case ISD::FREM: SplitVecRes_BinOp(N, Lo, Hi); break; + case ISD::SSAT: + case ISD::USAT: + SplitVecRes_SAT(N, Lo, Hi); + break; case ISD::FMA: SplitVecRes_TernaryOp(N, Lo, Hi); break; @@ -680,6 +694,16 @@ SetSplitVector(SDValue(N, ResNo), Lo, Hi); } +void DAGTypeLegalizer::SplitVecRes_SAT(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue OpLo, OpHi; + GetSplitVector(N->getOperand(0), OpLo, OpHi); + SDValue SatBit = N->getOperand(1); + SDLoc dl(N); + + Lo = DAG.getNode(N->getOpcode(), dl, OpLo.getValueType(), OpLo, SatBit); + Hi = DAG.getNode(N->getOpcode(), dl, OpHi.getValueType(), OpHi, SatBit); +} + void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHSLo, LHSHi; @@ -1789,6 +1813,10 @@ case ISD::FMA: Res = WidenVecRes_Ternary(N); break; + case ISD::USAT: + case ISD::SSAT: + Res = 
WidenVecRes_SAT(N); + break; } // If Res is null, the sub-method took care of registering the result. @@ -2039,6 +2067,13 @@ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); } +SDValue DAGTypeLegalizer::WidenVecRes_SAT(SDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue InOp = GetWidenedVector(N->getOperand(0)); + SDValue SatBitOp = N->getOperand(1); + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, SatBitOp); +} + SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5288,6 +5288,21 @@ setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg)); return nullptr; } + case Intrinsic::ssat: + case Intrinsic::usat: { + unsigned Opcode; + switch (Intrinsic) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::ssat: Opcode = ISD::SSAT; break; + case Intrinsic::usat: Opcode = ISD::USAT; break; + } + + setValue(&I, DAG.getNode(Opcode, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::STACKSAVE, sdl, Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -215,6 +215,8 @@ case ISD::UMULO: return "umulo"; case ISD::SUBC: return "subc"; case ISD::SUBE: return "sube"; + case ISD::SSAT: return "ssat"; + case ISD::USAT: return "usat"; case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -766,6 +766,9 @@ setOperationAction(ISD::FMINNUM, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FMAXNUM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SSAT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::USAT, (MVT::SimpleValueType)VT, Expand); + // These library functions default to expand. 
setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand); Index: test/CodeGen/X86/saturation-legalization.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/saturation-legalization.ll @@ -0,0 +1,465 @@ +; RUN: llc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NOVSELECT +; RUN: llc -mattr=+sse4.2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +; There's no way of catching both qword/dword variants of both ABCD and +; numbered regs, so the patterns involving 64 bits are a bit too specific. + +; What we really want to know is that the usat intrinsics are legalized to code +; equivalent to (same for ssat, with the appropriate min/max constants): +; %1 = icmp slt i8 %x, 0 +; %2 = select i1 %1, i8 0, i8 %x +; %3 = icmp sgt i8 %x, 15 +; %4 = select i1 %3, i8 15, i8 %2 +; ret i8 %4 + +declare i32 @llvm.ssat.i32(i32 %x, i32 %n) +declare i32 @llvm.usat.i32(i32 %x, i32 %n) + +define i32 @test_ssat_expand(i32 %x) { +; CHECK-LABEL: test_ssat_expand +; CHECK: cmpl $-128, %edi +; CHECK-NEXT: movl $-128, [[RES:%e[a-z0-9]+]] +; CHECK-NEXT: cmovgel %edi, [[RES]] +; CHECK-NEXT: cmpl $127, [[RES]] +; CHECK-NEXT: movl $127, %eax +; CHECK-NEXT: cmovlel [[RES]], %eax +; CHECK-NEXT: retq + %ssat_x = call i32 @llvm.ssat.i32(i32 %x, i32 8) + ret i32 %ssat_x +} + +define i32 @test_usat_expand(i32 %x) { +; CHECK-LABEL: test_usat_expand +; CHECK: xorl [[RES:%e[a-z0-9]+]], [[RES]] +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, [[RES]] +; CHECK-NEXT: cmpl $255, [[RES]] +; CHECK-NEXT: movl $255, %eax +; CHECK-NEXT: cmovlel [[RES]], %eax +; CHECK-NEXT: retq + %ssat_x = call i32 @llvm.usat.i32(i32 %x, i32 8) + ret i32 %ssat_x +} + +declare i12 @llvm.ssat.i12(i12 %x, i32 %n) +declare i12 @llvm.usat.i12(i12 %x, i32 %n) + +define i12 @test_ssat_12_promote_intres(i12 %x) { +; CHECK-LABEL: 
test_ssat_12_promote_intres +; CHECK: shll $4, %edi +; CHECK-NEXT: sarw $4, %di +; CHECK-NEXT: movswl %di, %eax +; CHECK-NEXT: cmpl $-128, %eax +; CHECK-NEXT: movw $-128, [[RES:%[bcd]x]] +; CHECK-NEXT: cmovlw [[RES]], %ax +; CHECK-NEXT: cwtl +; CHECK-NEXT: cmpl $127, %eax +; CHECK-NEXT: movw $127, [[RES]] +; CHECK-NEXT: cmovgw [[RES]], %ax +; CHECK-NEXT: # kill: AX AX EAX +; CHECK-NEXT: retq + %ssat_x = call i12 @llvm.ssat.i12(i12 %x, i32 8) + ret i12 %ssat_x +} + +define i12 @test_usat_12_promote_intres(i12 %x) { +; CHECK-LABEL: test_usat_12_promote_intres +; CHECK: shll $4, %edi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: sarw $4, %di +; CHECK-NEXT: cmovnsw %di, %ax +; CHECK-NEXT: cwtl +; CHECK-NEXT: cmpl $255, %eax +; CHECK-NEXT: movw $255, [[MAX:%[bcd]x]] +; CHECK-NEXT: cmovgw [[MAX]], %ax +; CHECK-NEXT: # kill: AX AX EAX +; CHECK-NEXT: retq + %ssat_x = call i12 @llvm.usat.i12(i12 %x, i32 8) + ret i12 %ssat_x +} + +declare i8 @llvm.ssat.i8(i8 %x, i32 %n) +declare i8 @llvm.usat.i8(i8 %x, i32 %n) + +define i8 @test_ssat_8_expand(i8 %x) { +; CHECK-LABEL: test_ssat_8_expand +; CHECK: movsbl %dil, %eax +; CHECK-NEXT: cmpl $-8, %eax +; CHECK-NEXT: movb $-8, %cl +; CHECK-NEXT: jl [[LTBB:.LBB[0-9_]+]] +; CHECK: movb %dil, %cl +; CHECK: [[LTBB]]: +; CHECK-NEXT: movsbl %cl, %eax +; CHECK-NEXT: cmpl $7, %eax +; CHECK-NEXT: movb $7, %al +; CHECK-NEXT: jg [[GTBB:.LBB[0-9_]+]] +; CHECK: movb %cl, %al +; CHECK: [[GTBB]]: +; CHECK-NEXT: retq + %ssat_x = call i8 @llvm.ssat.i8(i8 %x, i32 4) + ret i8 %ssat_x +} + +define i8 @test_usat_8_expand(i8 %x) { +; CHECK-LABEL: test_usat_8_expand +; CHECK: testb %dil, %dil +; CHECK-NEXT: jns [[MAXCMPBB:.LBB[0-9_]+]] +; CHECK: xorl %edi, %edi +; CHECK: [[MAXCMPBB]]: +; CHECK-NEXT: movsbl %dil, %eax +; CHECK-NEXT: cmpl $15, %eax +; CHECK-NEXT: movb $15, %al +; CHECK-NEXT: jg [[EXITBB:.LBB[0-9_]+]] +; CHECK: movb %dil, %al +; CHECK: [[EXITBB]]: +; CHECK-NEXT: retq + + %ssat_x = call i8 @llvm.usat.i8(i8 %x, i32 4) + ret i8 %ssat_x +} + 
+declare i128 @llvm.ssat.i128(i128 %x, i32 %n) +declare i128 @llvm.usat.i128(i128 %x, i32 %n) + +; FIXME: add a testcase for when the saturation bit is larger than 64 +define i128 @test_ssat_128_expand_intres(i128 %x) { +; CHECK-LABEL: test_ssat_128_expand_intres +; Compare the lower 64 bits of %x with -8. +; CHECK: cmpq $-8, %rdi +; If they are (ult) below -8, and the higher bits are -1, %x isn't in [-8,-1] +; CHECK-NEXT: setb [[X_LT_MIN:%[a-z0-9]+]] +; Compare the higher 64 bits of %x with -1. +; CHECK-NEXT: cmpq $-1, %rsi +; CHECK-NEXT: setl [[HI_X_LT_MONE:%[a-z0-9]+]] +; If they are equal to -1, %x is negative, so the lower check is enough. +; CHECK-NEXT: je [[HI_X_EQ_MONE_LBL:.LBB[0-9_]+]] +; If they are lower than -1, %x < -1*2^64, so %x < -8. +; If they are greater than -1, %x isn't negative, so %x > -8. +; CHECK: movb [[HI_X_LT_MONE]], [[X_LT_MIN]] +; CHECK: [[HI_X_EQ_MONE_LBL]]: +; %rdx <- (%x < -8) ? -1 : %rsi (higher bits of %x) +; CHECK-NEXT: cmpb $1, [[X_LT_MIN]] +; CHECK-NEXT: sbbq %rdx, %rdx +; CHECK-NEXT: notq %rdx +; CHECK-NEXT: orq %rsi, %rdx +; %rax <- (%x < -8) ? -8 : %rdi +; CHECK-NEXT: testb [[X_LT_MIN]], [[X_LT_MIN]] +; CHECK-NEXT: movq $-8, %rax +; CHECK-NEXT: cmoveq %rdi, %rax +; Compare the lower bits with 7. +; CHECK-NEXT: cmpq $7, %rdi +; CHECK-NEXT: seta [[X_GT_MAX:%[a-z0-9]+]] +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: setg [[HI_X_GT_ZERO:%[a-z0-9]+]] +; If the higher bits aren't equal to 0, make sure they're greater than 0. +; CHECK-NEXT: je [[HI_X_EQ_ZERO_LBL:.LBB[0-9_]+]] +; CHECK: movb [[HI_X_GT_ZERO]], [[X_GT_MAX]] +; CHECK: [[HI_X_EQ_ZERO_LBL]]: +; CHECK-NEXT: xorl %e[[MAX_HI:[a-z0-9]+]], %e[[MAX_HI]] +; CHECK-NEXT: testb [[X_GT_MAX]], [[X_GT_MAX]] +; CHECK-NEXT: movl $7, %e[[MAX_LO:[a-z0-9]+]] +; If %x > 7, return 7. Else return the min-clamped value computed above.
+; CHECK-NEXT: cmovneq %r[[MAX_LO]], %rax +; CHECK-NEXT: cmovneq %r[[MAX_HI]], %rdx +; CHECK-NEXT: retq + %ssat_x = call i128 @llvm.ssat.i128(i128 %x, i32 4) + ret i128 %ssat_x +} + +define i128 @test_usat_128_expand_intres(i128 %x) { +; CHECK-LABEL: test_usat_128_expand_intres +; CHECK: cmpq $15, %rdi +; CHECK-NEXT: seta [[X_GT_MAX:%[a-z0-9]+]] +; CHECK-NEXT: xorl %e[[ZERO:[a-z0-9]+]], %e[[ZERO]] +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %r[[ZERO]], %rsi +; CHECK-NEXT: cmovsq %r[[ZERO]], %rdi +; CHECK-NEXT: setg [[HI_X_GT_ZERO:%[a-z0-9]+]] +; CHECK-NEXT: je [[HI_X_EQ_ZERO_LBL:.LBB[0-9_]+]] +; CHECK: movb [[HI_X_GT_ZERO]], [[X_GT_MAX]] +; CHECK: [[HI_X_EQ_ZERO_LBL]]: +; CHECK-NEXT: testb [[X_GT_MAX]], [[X_GT_MAX]] +; CHECK-NEXT: movl $15, %e[[MAX_LO:[a-z0-9]+]] +; CHECK-NEXT: cmovneq %r[[MAX_LO]], %rdi +; CHECK-NEXT: cmovneq %r[[ZERO]], %rsi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rdx +; CHECK-NEXT: retq + %ssat_x = call i128 @llvm.usat.i128(i128 %x, i32 4) + ret i128 %ssat_x +} + +declare i117 @llvm.ssat.i117(i117 %x, i32 %n) +declare i117 @llvm.usat.i117(i117 %x, i32 %n) + +define i117 @test_ssat_117_promote_expand_intres(i117 %x) { +; CHECK-LABEL: test_ssat_117_promote_expand_intres +; CHECK: movq %rsi, [[SEXT_HI_X:%[a-z0-9]+]] +; CHECK-NEXT: shlq $11, [[SEXT_HI_X]] +; CHECK-NEXT: sarq $11, [[SEXT_HI_X]] +; CHECK-NEXT: cmpq $7, %rdi +; CHECK-NEXT: seta [[X_GT_MAX:%[a-z0-9]+]] +; CHECK-NEXT: testq [[SEXT_HI_X]], [[SEXT_HI_X]] +; CHECK-NEXT: setg [[SEXT_HI_X_GT_ZERO:%[a-z0-9]+]] +; CHECK-NEXT: movabsq $9007199254740991, [[MASK_I117:%[a-z0-9]+]] # imm = 0x1FFFFFFFFFFFFF +; CHECK-NEXT: andq [[MASK_I117]], %rsi +; CHECK-NEXT: je [[NO_HI_BITS_SET_LBL:.LBB[0-9_]+]] +; CHECK: movb [[SEXT_HI_X_GT_ZERO]], [[X_GT_MAX]] +; CHECK: [[NO_HI_BITS_SET_LBL]]: +; CHECK-NEXT: cmpq $-8, %rdi +; CHECK-NEXT: setb [[X_LT_ZERO:%[a-z0-9]+]] +; CHECK-NEXT: cmpq $-1, [[SEXT_HI_X]] +; CHECK-NEXT: setl [[SEXT_HI_X_LT_ZERO:%[a-z0-9]+]] +; CHECK-NEXT: cmpq 
[[MASK_I117]], %rsi +; CHECK-NEXT: je [[NO_HI_BITS_SET_LBL_2:.LBB[0-9_]+]] +; CHECK: movb [[SEXT_HI_X_LT_ZERO]], [[X_LT_ZERO]] +; CHECK: [[NO_HI_BITS_SET_LBL_2]]: +; CHECK-NEXT: cmpb $1, [[X_LT_ZERO]] +; CHECK-NEXT: sbbq [[HI_TMP_RES:%[a-z0-9]+]], [[HI_TMP_RES]] +; CHECK-NEXT: notq [[HI_TMP_RES]] +; CHECK-NEXT: orq [[SEXT_HI_X]], [[HI_TMP_RES]] +; CHECK-NEXT: testb [[X_LT_ZERO]], [[X_LT_ZERO]] +; CHECK-NEXT: movq $-8, [[MIN_LO:%[a-z0-9]+]] +; CHECK-NEXT: cmovneq [[MIN_LO]], %rdi +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb [[X_GT_MAX]], [[X_GT_MAX]] +; CHECK-NEXT: movl $7, %eax +; CHECK-NEXT: cmoveq %rdi, %rax +; CHECK-NEXT: cmoveq [[HI_TMP_RES]], %rdx +; CHECK-NEXT: retq + %ssat_x = call i117 @llvm.ssat.i117(i117 %x, i32 4) + ret i117 %ssat_x +} + +define i117 @test_usat_117_promote_expand_intres(i117 %x) { +; CHECK-LABEL: test_usat_117_promote_expand_intres +; CHECK: movq %rsi, [[SEXT_HI_X:%[a-z0-9]+]] +; CHECK-NEXT: shlq $11, [[SEXT_HI_X]] +; CHECK-NEXT: sarq $11, [[SEXT_HI_X]] +; CHECK-NEXT: cmpq $15, %rdi +; CHECK-NEXT: seta [[LO_GT_MAX:%[a-z0-9]+]] +; CHECK-NEXT: xorl [[ZERO:%[a-z0-9]+]]d, [[ZERO]]d +; CHECK-NEXT: testq [[SEXT_HI_X]], [[SEXT_HI_X]] +; CHECK-NEXT: cmovsq [[ZERO]], %rdx +; CHECK-NEXT: cmovsq [[ZERO]], %rdi +; CHECK-NEXT: setg [[SEXT_HI_X_GT_ZERO:%[a-z0-9]+]] +; CHECK-NEXT: movabsq $9007199254740991, [[MASK_I117:%[a-z0-9]+]] # imm = 0x1FFFFFFFFFFFFF +; CHECK-NEXT: testq [[MASK_I117]], %rsi +; CHECK-NEXT: je [[NO_HI_BITS_SET_LBL:.LBB[0-9_]+]] +; CHECK: movb [[SEXT_HI_X_GT_ZERO]], [[LO_GT_MAX]] +; CHECK: [[NO_HI_BITS_SET_LBL]]: +; CHECK: testb [[LO_GT_MAX]], [[LO_GT_MAX]] +; CHECK-NEXT: movl $15, %e[[MAX_LO:[a-z]+]] +; CHECK-NEXT: cmovneq %r[[MAX_LO]], %rdi +; CHECK-NEXT: cmovneq [[ZERO]], %rdx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %ssat_x = call i117 @llvm.usat.i117(i117 %x, i32 4) + ret i117 %ssat_x +} + +declare <2 x i64> @llvm.ssat.v2i64(<2 x i64> %x, i32 %n) +declare <2 x i64> @llvm.usat.v2i64(<2 x i64> %x, i32 %n) + 
+define <2 x i64> @test_ssat_v2i64_unroll_or_expand(<2 x i64> %x) { +; NOVSELECT-LABEL: test_ssat_v2i64_unroll_or_expand +; NOVSELECT: movd %xmm0, [[X0:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpq $-8, [[X0]] +; NOVSELECT-NEXT: movq $-8, [[MIN:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmovlq [[MIN]], [[X0]] +; NOVSELECT-NEXT: cmpq $7, [[X0]] +; NOVSELECT-NEXT: movl $7, %e[[MAX:[a-z0-9]+]] +; NOVSELECT-NEXT: cmovgq %r[[MAX]], [[X0]] +; NOVSELECT-NEXT: movd [[X0]], %xmm1 +; NOVSELECT-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; NOVSELECT-NEXT: movd %xmm0, [[X1:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpq $-8, [[X1]] +; NOVSELECT-NEXT: cmovlq [[MIN]], [[X1]] +; NOVSELECT-NEXT: cmpq $7, [[X1]] +; NOVSELECT-NEXT: cmovgq %r[[MAX]], [[X1]] +; NOVSELECT-NEXT: movd [[X1]], %xmm0 +; NOVSELECT-NEXT: punpcklqdq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0] +; NOVSELECT-NEXT: movdqa %xmm1, %xmm0 +; NOVSELECT-NEXT: retq + +; SSE42-LABEL: test_ssat_v2i64_unroll_or_expand +; SSE42: movdqa %xmm0, [[TMP1:%xmm[0-9]+]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MIN:xmm[0-9]+]] # [[MIN]] = [18446744073709551608,18446744073709551608] +; SSE42-NEXT: movdqa %[[MIN]], %xmm0 +; SSE42-NEXT: pcmpgtq [[TMP1]], %xmm0 +; SSE42-NEXT: blendvpd %[[MIN]], [[TMP1]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MAX:xmm[0-9]+]] # [[MAX]] = [7,7] +; SSE42-NEXT: movapd [[TMP1]], %xmm0 +; SSE42-NEXT: pcmpgtq %[[MAX]], %xmm0 +; SSE42-NEXT: blendvpd %[[MAX]], [[TMP1]] +; SSE42-NEXT: movapd [[TMP1]], %xmm0 +; SSE42-NEXT: retq + %ssat_x = call <2 x i64> @llvm.ssat.v2i64(<2 x i64> %x, i32 4) + ret <2 x i64> %ssat_x +} + +define <2 x i64> @test_usat_v2i64_unroll_or_expand(<2 x i64> %x) { +; NOVSELECT-LABEL: test_usat_v2i64_unroll_or_expand +; NOVSELECT: movd %xmm0, [[X0:%[a-z0-9]+]] +; NOVSELECT-NEXT: xorl %e[[MIN:[a-z0-9]+]], %e[[MIN]] +; NOVSELECT-NEXT: testq [[X0]], [[X0]] +; NOVSELECT-NEXT: cmovsq %r[[MIN]], [[X0]] +; NOVSELECT-NEXT: cmpq $15, [[X0]] +; NOVSELECT-NEXT: movl $15, %e[[MAX:[a-z0-9]+]] +; NOVSELECT-NEXT: cmovgq %r[[MAX]], 
[[X0]] +; NOVSELECT-NEXT: movd [[X0]], %xmm1 +; NOVSELECT-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; NOVSELECT-NEXT: movd %xmm0, [[X1:%[a-z0-9]+]] +; NOVSELECT-NEXT: testq [[X1]], [[X1]] +; NOVSELECT-NEXT: cmovsq %r[[MIN]], [[X1]] +; NOVSELECT-NEXT: cmpq $15, [[X1]] +; NOVSELECT-NEXT: cmovgq %r[[MAX]], [[X1]] +; NOVSELECT-NEXT: movd [[X1]], %xmm0 +; NOVSELECT-NEXT: punpcklqdq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0] +; NOVSELECT-NEXT: movdqa %xmm1, %xmm0 +; NOVSELECT-NEXT: retq + +; SSE42-LABEL: test_usat_v2i64_unroll +; SSE42: movdqa %xmm0, [[TMP1:%xmm[0-9]+]] +; SSE42-NEXT: xorpd [[MIN:%xmm[0-9]+]], [[MIN]] +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: pcmpgtq [[TMP1]], %xmm0 +; SSE42-NEXT: blendvpd [[MIN]], [[TMP1]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MAX:xmm[0-9]+]] # [[MAX]] = [15,15] +; SSE42-NEXT: movapd [[TMP1]], %xmm0 +; SSE42-NEXT: pcmpgtq %[[MAX]], %xmm0 +; SSE42-NEXT: blendvpd %[[MAX]], [[TMP1]] +; SSE42-NEXT: movapd [[TMP1]], %xmm0 +; SSE42-NEXT: retq + %ssat_x = call <2 x i64> @llvm.usat.v2i64(<2 x i64> %x, i32 4) + ret <2 x i64> %ssat_x +} + +declare <4 x i30> @llvm.ssat.v4i30(<4 x i30> %x, i32 %n) +declare <4 x i30> @llvm.usat.v4i30(<4 x i30> %x, i32 %n) + +define <4 x i30> @test_ssat_v4i30_promote_expand(<4 x i30> %x) { +; SSE42-LABEL: test_ssat_v4i30_promote_expand +; SSE42: movdqa %xmm0, [[TMP1:%xmm[0-9]+]] +; SSE42-NEXT: pslld $2, [[TMP1]] +; SSE42-NEXT: psrad $2, [[TMP1]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MIN:xmm[0-9]+]] # [[MIN]] = [4294967288,4294967288,4294967288,4294967288] +; SSE42-NEXT: movdqa %[[MIN]], %xmm0 +; SSE42-NEXT: pcmpgtd [[TMP1]], %xmm0 +; SSE42-NEXT: blendvps %[[MIN]], [[TMP1]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MAX:xmm[0-9]+]] # [[MAX]] = [7,7,7,7] +; SSE42-NEXT: movaps [[TMP1]], %xmm0 +; SSE42-NEXT: pcmpgtd %[[MAX]], %xmm0 +; SSE42-NEXT: blendvps %[[MAX]], [[TMP1]] +; SSE42-NEXT: movaps [[TMP1]], %xmm0 +; SSE42-NEXT: retq + %ssat_x = call <4 x i30> @llvm.ssat.v4i30(<4 x i30> %x, i32 4) + ret <4 x 
i30> %ssat_x +} + +define <4 x i30> @test_usat_v4i30_promote_expand(<4 x i30> %x) { +; SSE42-LABEL: test_usat_v4i30_promote_expand +; SSE42: movdqa %xmm0, [[TMP1]] +; SSE42-NEXT: pslld $2, [[TMP1]] +; SSE42-NEXT: psrad $2, [[TMP1]] +; SSE42-NEXT: xorps [[MIN:%xmm[0-9]+]], [[MIN]] +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: pcmpgtd [[TMP1]], %xmm0 +; SSE42-NEXT: blendvps [[MIN]], [[TMP1]] +; SSE42-NEXT: movdqa {{.*}}(%rip), %[[MAX:xmm[0-9]+]] # [[MAX]] = [15,15,15,15] +; SSE42-NEXT: movaps [[TMP1]], %xmm0 +; SSE42-NEXT: pcmpgtd %[[MAX]], %xmm0 +; SSE42-NEXT: blendvps %[[MAX]], [[TMP1]] +; SSE42-NEXT: movaps [[TMP1]], %xmm0 +; SSE42-NEXT: retq + %ssat_x = call <4 x i30> @llvm.usat.v4i30(<4 x i30> %x, i32 4) + ret <4 x i30> %ssat_x +} + +declare <3 x i32> @llvm.ssat.v3i32(<3 x i32> %x, i32 %n) +declare <3 x i32> @llvm.usat.v3i32(<3 x i32> %x, i32 %n) + +;; The 4th element is undef, but it's not obvious so also saturate it. + +define <3 x i32> @test_ssat_v3i32_widen(<3 x i32> %x) { +; NOVSELECT-LABEL: test_ssat_v3i32_widen +; NOVSELECT: pshufd $-25, %xmm0, %xmm1 # xmm1 = xmm0[3,1,2,3] +; NOVSELECT-NEXT: movd %xmm1, [[X0:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpl $-8, [[X0]] +; NOVSELECT-NEXT: movl $-8, [[MIN:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmovll [[MIN]], [[X0]] +; NOVSELECT-NEXT: cmpl $7, [[X0]] +; NOVSELECT-NEXT: movl $7, [[MAX:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X0]] +; NOVSELECT-NEXT: movd [[X0]], %xmm1 +; NOVSELECT-NEXT: pshufd $-27, %xmm0, %xmm2 # xmm2 = xmm0[1,1,2,3] +; NOVSELECT-NEXT: movd %xmm2, [[X1:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpl $-8, [[X1]] +; NOVSELECT-NEXT: cmovll [[MIN]], [[X1]] +; NOVSELECT-NEXT: cmpl $7, [[X1]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X1]] +; NOVSELECT-NEXT: movd [[X1]], %xmm2 +; NOVSELECT-NEXT: punpckldq %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; NOVSELECT-NEXT: movd %xmm0, [[X2:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpl $-8, [[X2]] +; NOVSELECT-NEXT: cmovll [[MIN]], [[X2]] +; NOVSELECT-NEXT: cmpl $7, [[X2]] +; 
NOVSELECT-NEXT: cmovgl [[MAX]], [[X2]] +; NOVSELECT-NEXT: movd [[X2]], %xmm1 +; NOVSELECT-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; NOVSELECT-NEXT: movd %xmm0, [[X3:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmpl $-8, [[X3]] +; NOVSELECT-NEXT: cmovll [[MIN]], [[X3]] +; NOVSELECT-NEXT: cmpl $7, [[X3]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X3]] +; NOVSELECT-NEXT: movd [[X3]], %xmm0 +; NOVSELECT-NEXT: punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NOVSELECT-NEXT: punpckldq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NOVSELECT-NEXT: movdqa %xmm1, %xmm0 +; NOVSELECT-NEXT: retq + %ssat_x = call <3 x i32> @llvm.ssat.v3i32(<3 x i32> %x, i32 4) + ret <3 x i32> %ssat_x +} + +define <3 x i32> @test_usat_v3i32_widen(<3 x i32> %x) { +; NOVSELECT-LABEL: test_usat_v3i32_widen +; NOVSELECT: pshufd $-25, %xmm0, %xmm1 # xmm1 = xmm0[3,1,2,3] +; NOVSELECT-NEXT: movd %xmm1, [[X0:%[a-z0-9]+]] +; NOVSELECT-NEXT: xorl [[MIN:%[a-z0-9]+]], [[MIN]] +; NOVSELECT-NEXT: testl [[X0]], [[X0]] +; NOVSELECT-NEXT: cmovsl [[MIN]], [[X0]] +; NOVSELECT-NEXT: cmpl $15, [[X0]] +; NOVSELECT-NEXT: movl $15, [[MAX:%[a-z0-9]+]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X0]] +; NOVSELECT-NEXT: movd [[X0]], %xmm1 +; NOVSELECT-NEXT: pshufd $-27, %xmm0, %xmm2 # xmm2 = xmm0[1,1,2,3] +; NOVSELECT-NEXT: movd %xmm2, [[X1:%[a-z0-9]+]] +; NOVSELECT-NEXT: testl [[X1]], [[X1]] +; NOVSELECT-NEXT: cmovsl [[MIN]], [[X1]] +; NOVSELECT-NEXT: cmpl $15, [[X1]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X1]] +; NOVSELECT-NEXT: movd [[X1]], %xmm2 +; NOVSELECT-NEXT: punpckldq %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; NOVSELECT-NEXT: movd %xmm0, [[X2:%[a-z0-9]+]] +; NOVSELECT-NEXT: testl [[X2]], [[X2]] +; NOVSELECT-NEXT: cmovsl [[MIN]], [[X2]] +; NOVSELECT-NEXT: cmpl $15, [[X2]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X2]] +; NOVSELECT-NEXT: movd [[X2]], %xmm1 +; NOVSELECT-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; NOVSELECT-NEXT: movd %xmm0, [[X3:%[a-z0-9]+]] +; NOVSELECT-NEXT: 
testl [[X3]], [[X3]] +; NOVSELECT-NEXT: cmovsl [[MIN]], [[X3]] +; NOVSELECT-NEXT: cmpl $15, [[X3]] +; NOVSELECT-NEXT: cmovgl [[MAX]], [[X3]] +; NOVSELECT-NEXT: movd [[X3]], %xmm0 +; NOVSELECT-NEXT: punpckldq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NOVSELECT-NEXT: punpckldq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; NOVSELECT-NEXT: movdqa %xmm1, %xmm0 +; NOVSELECT-NEXT: retq + %ssat_x = call <3 x i32> @llvm.usat.v3i32(<3 x i32> %x, i32 4) + ret <3 x i32> %ssat_x +}