Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -316,7 +316,7 @@ LegalizeResult lowerExtract(MachineInstr &MI); LegalizeResult lowerInsert(MachineInstr &MI); LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); - LegalizeResult lowerAddSubSatToMinMax(MachineInstr &MI); + LegalizeResult lowerAddSubSatToMinMax(MachineInstr &MI, LLT WideTy = LLT()); LegalizeResult lowerAddSubSatToAddoSubo(MachineInstr &MI); LegalizeResult lowerBswap(MachineInstr &MI); LegalizeResult lowerBitreverse(MachineInstr &MI); Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -827,6 +827,17 @@ changeTo(typeIdx(TypeIdx), Ty)); } + /// Lower the instruction, and prefer to perform the expansion in \p Ty. This + /// type is treated as an optimization hint, and the type request may not be + /// respected for all lowerings. + LegalizeRuleSet &lowerMinScalar(unsigned TypeIdx, const LLT Ty) { + using namespace LegalityPredicates; + using namespace LegalizeMutations; + return actionIf(LegalizeAction::Lower, + scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()), + changeTo(typeIdx(TypeIdx), Ty)); + } + /// Ensure the scalar is at most as wide as Ty. LegalizeRuleSet &maxScalarOrElt(unsigned TypeIdx, const LLT Ty) { using namespace LegalityPredicates; Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2699,23 +2699,26 @@ return lowerReadWriteRegister(MI); case G_UADDSAT: case G_USUBSAT: { + LLT Ty = LowerTy.isValid() ? 
LowerTy : + MRI.getType(MI.getOperand(0).getReg()); + // Try to make a reasonable guess about which lowering strategy to use. The // target can override this with custom lowering and calling the // implementation functions. - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); if (LI.isLegalOrCustom({G_UMIN, Ty})) - return lowerAddSubSatToMinMax(MI); + return lowerAddSubSatToMinMax(MI, Ty); return lowerAddSubSatToAddoSubo(MI); } case G_SADDSAT: case G_SSUBSAT: { - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + LLT Ty = LowerTy.isValid() ? LowerTy : + MRI.getType(MI.getOperand(0).getReg()); // FIXME: It would probably make more sense to see if G_SADDO is preferred, // since it's a shorter expansion. However, we would need to figure out the // preferred boolean type for the carry out for the query. if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty})) - return lowerAddSubSatToMinMax(MI); + return lowerAddSubSatToMinMax(MI, Ty); return lowerAddSubSatToAddoSubo(MI); } } @@ -5294,41 +5297,62 @@ return Legalized; } +/// Expand saturating add/sub using min/max instructions. If \p WideTy is +/// provided, this will perform the expansion in a wider bitwidth. 
LegalizerHelper::LegalizeResult -LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) { +LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI, LLT WideTy) { + const unsigned Opc = MI.getOpcode(); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); - LLT Ty = MRI.getType(Res); - bool IsSigned; - bool IsAdd; - unsigned BaseOp; - switch (MI.getOpcode()) { - default: - llvm_unreachable("unexpected addsat/subsat opcode"); - case TargetOpcode::G_UADDSAT: - IsSigned = false; - IsAdd = true; - BaseOp = TargetOpcode::G_ADD; - break; - case TargetOpcode::G_SADDSAT: - IsSigned = true; - IsAdd = true; - BaseOp = TargetOpcode::G_ADD; - break; - case TargetOpcode::G_USUBSAT: - IsSigned = false; - IsAdd = false; - BaseOp = TargetOpcode::G_SUB; - break; - case TargetOpcode::G_SSUBSAT: - IsSigned = true; - IsAdd = false; - BaseOp = TargetOpcode::G_SUB; - break; + + Register OrigRes = Res; + LLT OrigTy = MRI.getType(Res); + LLT Ty = OrigTy; + bool DoPromote = WideTy.isValid() && WideTy != OrigTy; + + if (DoPromote) { + bool IsSigned = Opc == TargetOpcode::G_SADDSAT || + Opc == TargetOpcode::G_SSUBSAT; + Res = MRI.createGenericVirtualRegister(WideTy); + Ty = WideTy; + if (IsSigned) { + LHS = MIRBuilder.buildSExt(WideTy, LHS).getReg(0); + RHS = MIRBuilder.buildSExt(WideTy, RHS).getReg(0); + } else { + LHS = MIRBuilder.buildZExt(WideTy, LHS).getReg(0); + RHS = MIRBuilder.buildZExt(WideTy, RHS).getReg(0); + } } - if (IsSigned) { + // usub.sat(a, b) -> umax(a, b) - b + // + // TODO: If the target had umin, and not umax an alternative would be + // usub.sat(a, b) -> a - umin(a, b) + if (Opc == TargetOpcode::G_USUBSAT) { + auto Max = MIRBuilder.buildUMax(Ty, LHS, RHS); + MIRBuilder.buildSub(Res, Max, RHS); + } else if (Opc == TargetOpcode::G_UADDSAT) { + if (DoPromote) { + // uadd.sat(a, b) -> umin(zext(a) + zext(b), max_val) + // + // If we're promoting, we know the wider add won't overflow, so do a + // 
simple clamp on the wider result. + + unsigned OrigNumBits = OrigTy.getScalarSizeInBits(); + unsigned NewNumBits = Ty.getScalarSizeInBits(); + APInt MaxVal = APInt::getAllOnesValue(OrigNumBits); + auto SatMax = MIRBuilder.buildConstant(Ty, MaxVal.zext(NewNumBits)); + auto Add = MIRBuilder.buildAdd(Ty, LHS, RHS); + MIRBuilder.buildUMin(Res, Add, SatMax); + } else { + // uadd.sat(a, b) -> a + umin(~a, b) + Register Not = MIRBuilder.buildNot(Ty, LHS).getReg(0); + auto Min = MIRBuilder.buildUMin(Ty, Not, RHS); + MIRBuilder.buildAdd(Res, LHS, Min); + } + } else { + const bool IsAdd = Opc == TargetOpcode::G_SADDSAT; // sadd.sat(a, b) -> // hi = 0x7fffffff - smax(a, 0) // lo = 0x80000000 - smin(a, 0) @@ -5339,11 +5363,15 @@ // a - smin(smax(lo, b), hi) // TODO: AMDGPU can use a "median of 3" instruction here: // a +/- med3(lo, b, hi) - uint64_t NumBits = Ty.getScalarSizeInBits(); + unsigned OrigNumBits = OrigTy.getScalarSizeInBits(); + unsigned NewNumBits = Ty.getScalarSizeInBits(); auto MaxVal = - MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits)); + MIRBuilder.buildConstant( + Ty, APInt::getSignedMaxValue(OrigNumBits).sextOrSelf(NewNumBits)); auto MinVal = - MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits)); + MIRBuilder.buildConstant( + Ty, APInt::getSignedMinValue(OrigNumBits).sextOrSelf(NewNumBits)); + MachineInstrBuilder Hi, Lo; if (IsAdd) { auto Zero = MIRBuilder.buildConstant(Ty, 0); @@ -5358,15 +5386,14 @@ } auto RHSClamped = MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi); + + unsigned BaseOp = IsAdd ? TargetOpcode::G_ADD : TargetOpcode::G_SUB; MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped}); - } else { - // uadd.sat(a, b) -> a + umin(~a, b) - // usub.sat(a, b) -> a - umin(a, b) - Register Not = IsAdd ? 
MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS; - auto Min = MIRBuilder.buildUMin(Ty, Not, RHS); - MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min}); } + if (DoPromote) + MIRBuilder.buildTrunc(OrigRes, Res); + MI.eraseFromParent(); return Legalized; } Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -476,7 +476,7 @@ } else { // Clamp bit support was added in VI, along with 16-bit operations. getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .minScalar(0, S32) + .lowerMinScalar(0, S32) .scalarize(0) .lower(); } @@ -484,7 +484,7 @@ // FIXME: DAG expansion gets better results. The widening uses the smaller // range values and goes for the min/max lowering directly. getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) - .minScalar(0, S32) + .lowerMinScalar(0, S32) .scalarize(0) .lower(); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -13,22 +13,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: 
[[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -64 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: saddsat_s7 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -82,22 +80,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], 
[[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: saddsat_s8 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -160,41 +156,39 @@ ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32) ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8 + ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX 
[[SHL]], [[C5]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C5]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C5]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C5]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C2]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C2]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C5]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C5]] ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[SMAX2]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C5]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C5]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[SMIN2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ADD1]], [[C2]](s32) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASHR]](s32) + ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = 
G_TRUNC [[ADD]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C6]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX6: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C7]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) - ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) + ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32) @@ -307,22 +301,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; 
GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: saddsat_s16 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -373,39 +365,37 @@ ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: 
[[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX2]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ADD1]], [[C]](s32) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C4]] - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL4]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = 
G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX8-LABEL: name: saddsat_v2s16 @@ -478,57 +468,54 @@ ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX2]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ADD1]], [[C]](s32) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SHL4]], [[C3]] + ; GFX6: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 16 + ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX4]] - ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SHL4]], [[C3]] + ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN4]] - ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB5]], [[SHL5]] + ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB5]], [[SEXT_INREG5]] ; GFX6: [[SMIN5:%[0-9]+]]:_(s32) = G_SMIN [[SMAX5]], [[SUB4]] - ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SHL4]], [[SMIN5]] - ; GFX6: 
[[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[ADD2]], [[C]](s32) + ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG4]], [[SMIN5]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL6]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[ASHR2]](s32) + ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL7]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 @@ -662,70 +649,66 @@ ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = 
G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[ADD]], [[C]](s32) + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX2]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], 
[[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB2]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ADD1]], [[C]](s32) + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 16 ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C]](s32) - ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SHL4]], [[C3]] + ; GFX6: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 16 + ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX4]] - ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SHL4]], [[C3]] + ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN4]] - ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB5]], [[SHL5]] + ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB5]], [[SEXT_INREG5]] ; GFX6: [[SMIN5:%[0-9]+]]:_(s32) = G_SMIN [[SMAX5]], [[SUB4]] - ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SHL4]], [[SMIN5]] - ; GFX6: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[ADD2]], [[C]](s32) + ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG4]], [[SMIN5]] ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX6: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY8]], 16 ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32) - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C]](s32) - ; GFX6: [[SMAX6:%[0-9]+]]:_(s32) = G_SMAX [[SHL6]], [[C3]] + ; GFX6: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY9]], 16 + ; GFX6: [[SMAX6:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG6]], [[C3]] ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMAX6]] - ; GFX6: 
[[SMIN6:%[0-9]+]]:_(s32) = G_SMIN [[SHL6]], [[C3]] + ; GFX6: [[SMIN6:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG6]], [[C3]] ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[SMIN6]] - ; GFX6: [[SMAX7:%[0-9]+]]:_(s32) = G_SMAX [[SUB7]], [[SHL7]] + ; GFX6: [[SMAX7:%[0-9]+]]:_(s32) = G_SMAX [[SUB7]], [[SEXT_INREG7]] ; GFX6: [[SMIN7:%[0-9]+]]:_(s32) = G_SMIN [[SMAX7]], [[SUB6]] - ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SHL6]], [[SMIN7]] - ; GFX6: [[ASHR3:%[0-9]+]]:_(s32) = G_ASHR [[ADD3]], [[C]](s32) + ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG6]], [[SMIN7]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C4]] - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] - ; GFX6: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL8]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[ASHR2]](s32) + ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]] - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ASHR3]](s32) + ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]] - ; GFX6: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL9]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS 
[[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -13,22 +13,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 7 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 7 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -64 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = 
G_ASHR [[SUB2]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: ssubsat_s7 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -82,22 +80,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C]](s32) - ; 
GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: ssubsat_s8 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -160,41 +156,39 @@ ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32) ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8 + ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 + ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128 ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C5]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C5]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C3]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C5]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C5]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C4]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C2]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 ; GFX6: 
[[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C2]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C5]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C5]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C3]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C5]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C5]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C4]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB5]], [[C2]](s32) + ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[ASHR]](s32) + ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB2]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C6]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX6: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB5]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C7]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) - ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) + ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32) @@ -307,22 +301,20 @@ ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C2]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: ssubsat_s16 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -373,39 +365,37 @@ ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX6: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: 
[[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C1]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB5]], [[C]](s32) + ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C4]] - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB5]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL4]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX8-LABEL: name: ssubsat_v2s16 @@ -478,57 +468,54 @@ ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; 
GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C1]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[SMIN3]] - ; GFX6: 
[[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB5]], [[C]](s32) + ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SHL4]], [[C3]] + ; GFX6: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 16 + ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[SMAX4]], [[C1]] - ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SHL4]], [[C3]] + ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SMIN4]], [[C2]] - ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB6]], [[SHL5]] + ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB6]], [[SEXT_INREG5]] ; GFX6: [[SMIN5:%[0-9]+]]:_(s32) = G_SMIN [[SMAX5]], [[SUB7]] - ; GFX6: [[SUB8:%[0-9]+]]:_(s32) = G_SUB [[SHL4]], [[SMIN5]] - ; GFX6: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SUB8]], [[C]](s32) + ; GFX6: [[SUB8:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG4]], [[SMIN5]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ASHR1]](s32) + ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[SUB5]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL6]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: 
[[COPY9:%[0-9]+]]:_(s32) = COPY [[ASHR2]](s32) + ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB8]](s32) ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[C5]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL7]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C5]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 @@ -662,70 +649,66 @@ ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 16 ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647 - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648 + ; GFX6: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 16 + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SHL]], [[C3]] + ; GFX6: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C1]] - ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SHL]], [[C3]] + ; GFX6: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[C3]] ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C2]] - ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[SHL1]] + ; GFX6: [[SMAX1:%[0-9]+]]:_(s32) = 
G_SMAX [[SUB]], [[SEXT_INREG1]] ; GFX6: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[SMIN1]] - ; GFX6: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C]](s32) + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG]], [[SMIN1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 16 ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SHL2]], [[C3]] + ; GFX6: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 16 + ; GFX6: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SMAX2]], [[C1]] - ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SHL2]], [[C3]] + ; GFX6: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG2]], [[C3]] ; GFX6: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[SMIN2]], [[C2]] - ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SHL3]] + ; GFX6: [[SMAX3:%[0-9]+]]:_(s32) = G_SMAX [[SUB3]], [[SEXT_INREG3]] ; GFX6: [[SMIN3:%[0-9]+]]:_(s32) = G_SMIN [[SMAX3]], [[SUB4]] - ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[SMIN3]] - ; GFX6: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB5]], [[C]](s32) + ; GFX6: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG2]], [[SMIN3]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 16 ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C]](s32) - ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SHL4]], [[C3]] + ; GFX6: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 16 + ; GFX6: [[SMAX4:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[SMAX4]], [[C1]] 
- ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SHL4]], [[C3]] + ; GFX6: [[SMIN4:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG4]], [[C3]] ; GFX6: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SMIN4]], [[C2]] - ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB6]], [[SHL5]] + ; GFX6: [[SMAX5:%[0-9]+]]:_(s32) = G_SMAX [[SUB6]], [[SEXT_INREG5]] ; GFX6: [[SMIN5:%[0-9]+]]:_(s32) = G_SMIN [[SMAX5]], [[SUB7]] - ; GFX6: [[SUB8:%[0-9]+]]:_(s32) = G_SUB [[SHL4]], [[SMIN5]] - ; GFX6: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SUB8]], [[C]](s32) + ; GFX6: [[SUB8:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG4]], [[SMIN5]] ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX6: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY8]], 16 ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32) - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C]](s32) - ; GFX6: [[SMAX6:%[0-9]+]]:_(s32) = G_SMAX [[SHL6]], [[C3]] + ; GFX6: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY9]], 16 + ; GFX6: [[SMAX6:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG6]], [[C3]] ; GFX6: [[SUB9:%[0-9]+]]:_(s32) = G_SUB [[SMAX6]], [[C1]] - ; GFX6: [[SMIN6:%[0-9]+]]:_(s32) = G_SMIN [[SHL6]], [[C3]] + ; GFX6: [[SMIN6:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG6]], [[C3]] ; GFX6: [[SUB10:%[0-9]+]]:_(s32) = G_SUB [[SMIN6]], [[C2]] - ; GFX6: [[SMAX7:%[0-9]+]]:_(s32) = G_SMAX [[SUB9]], [[SHL7]] + ; GFX6: [[SMAX7:%[0-9]+]]:_(s32) = G_SMAX [[SUB9]], [[SEXT_INREG7]] ; GFX6: [[SMIN7:%[0-9]+]]:_(s32) = G_SMIN [[SMAX7]], [[SUB10]] - ; GFX6: [[SUB11:%[0-9]+]]:_(s32) = G_SUB [[SHL6]], [[SMIN7]] - ; GFX6: [[ASHR3:%[0-9]+]]:_(s32) = G_ASHR [[SUB11]], [[C]](s32) + ; GFX6: [[SUB11:%[0-9]+]]:_(s32) = G_SUB [[SEXT_INREG6]], [[SMIN7]] ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ASHR]](s32) + ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C4]] - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY 
[[ASHR1]](s32) + ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB5]](s32) ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] - ; GFX6: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL8]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[ASHR2]](s32) + ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[SUB8]](s32) ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]] - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[ASHR3]](s32) + ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SUB11]](s32) ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]] - ; GFX6: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL9]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir @@ -12,17 +12,14 @@ ; GFX6-LABEL: name: uaddsat_s7 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) 
- ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: uaddsat_s7 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -66,17 +63,14 @@ ; GFX6-LABEL: name: uaddsat_s8 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: uaddsat_s8 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ 
-129,33 +123,28 @@ ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32) ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) + ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C3]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C2]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C3]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C2]](s32) - ; GFX6: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[SHL2]], [[C3]] - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[ADD1]], [[C2]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[AND2]], [[AND3]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[ADD1]], [[C3]] ; GFX6: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UMIN]](s32) + ; 
GFX6: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX6: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) - ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UMIN1]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY6]](s32) + ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC1]] ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: uaddsat_v2s8 @@ -251,17 +240,14 @@ ; GFX6-LABEL: name: uaddsat_s16 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = 
COPY [[UMIN]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: uaddsat_s16 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -302,30 +288,25 @@ ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[SHL2]], [[C1]] - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[ADD1]], [[C]](s32) - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND 
[[COPY7]], [[C2]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL4]] + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[AND2]], [[AND3]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[ADD1]], [[C1]] + ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UMIN1]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]] ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX8-LABEL: name: uaddsat_v2s16 @@ -382,44 +363,37 @@ ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX6: 
[[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[SHL2]], [[C1]] - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[ADD1]], [[C]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[AND2]], [[AND3]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[ADD1]], [[C1]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[SHL4]], [[C1]] - ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[XOR2]], [[SHL5]] - ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SHL4]], [[UMIN2]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD2]], [[C]](s32) - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C2]] - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL6]] + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND4]], [[AND5]] + ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[ADD2]], [[C1]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[UMIN1]](s32) + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX6: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] - ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[C3]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL7]] + ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[UMIN2]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 @@ -531,53 +505,44 @@ ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX6: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL]], [[C1]] - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[SHL1]] - ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SHL]], [[UMIN]] - ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD 
[[AND]], [[AND1]] + ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[ADD]], [[C1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[SHL2]], [[C1]] - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[SHL3]] - ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[ADD1]], [[C]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[AND2]], [[AND3]] + ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[ADD1]], [[C1]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C]](s32) - ; GFX6: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[SHL4]], [[C1]] - ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[XOR2]], [[SHL5]] - ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SHL4]], [[UMIN2]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[ADD2]], [[C]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND4]], [[AND5]] + ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[ADD2]], [[C1]] ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32) - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C]](s32) - ; GFX6: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[SHL6]], [[C1]] - ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[XOR3]], [[SHL7]] - ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = 
G_ADD [[SHL6]], [[UMIN3]] - ; GFX6: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[ADD3]], [[C]](s32) - ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]] - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] - ; GFX6: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL8]] + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX6: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[AND6]], [[AND7]] + ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[ADD3]], [[C1]] + ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UMIN]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[UMIN1]](s32) + ; GFX6: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C2]] - ; GFX6: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL9]] + ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UMIN2]](s32) + ; GFX6: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[UMIN3]](s32) + ; GFX6: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), 
[[BITCAST5]](<2 x s16>) ; GFX6: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -12,15 +12,14 @@ ; GFX6-LABEL: name: usubsat_s7 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: usubsat_s7 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -64,15 +63,14 @@ ; GFX6-LABEL: name: usubsat_s8 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], 
[[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: usubsat_s8 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -125,30 +123,28 @@ ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32) ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) + ; GFX6: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C2]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C2]](s32) - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR 
[[SUB1]], [[C2]](s32) - ; GFX6: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX6: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[AND2]], [[AND3]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMAX1]], [[AND3]] + ; GFX6: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX6: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX6: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY6]](s32) - ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY6]](s32) + ; GFX6: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC1]] ; GFX6: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX6: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX8-LABEL: name: usubsat_v2s8 @@ -244,15 +240,14 @@ ; GFX6-LABEL: name: usubsat_s16 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) - ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; 
GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) - ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] + ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; GFX6: $vgpr0 = COPY [[COPY4]](s32) ; GFX8-LABEL: name: usubsat_s16 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -293,27 +288,25 @@ ; GFX6: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX6: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX6: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 65535 - ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL4]] + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX6: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[AND2]], [[AND3]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMAX1]], [[AND3]] + ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL]] ; GFX6: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX6: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX8-LABEL: name: usubsat_v2s16 @@ -370,40 +363,37 @@ ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY1]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] ; GFX6: 
[[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX6: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[AND2]], [[AND3]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMAX1]], [[AND3]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SHL4]], [[SHL5]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL4]], [[UMIN2]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[SUB2]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] - ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) - ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL6]] + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX6: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[AND4]], [[AND5]] + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMAX2]], [[AND5]] + ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX6: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] ; GFX6: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL7]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<4 x s16>), 0 @@ -515,48 +505,44 @@ ; GFX6: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX6: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) ; GFX6: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX6: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] ; GFX6: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C]](s32) - ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[SHL]], [[SHL1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SHL]], [[UMIN]] - ; GFX6: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C]](s32) + ; GFX6: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[AND1]] ; GFX6: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX6: [[AND2:%[0-9]+]]:_(s32) = 
G_AND [[COPY4]], [[C1]] ; GFX6: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) - ; GFX6: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[C]](s32) - ; GFX6: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32) - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX6: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[SUB1]], [[C]](s32) + ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX6: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[AND2]], [[AND3]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMAX1]], [[AND3]] ; GFX6: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX6: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] ; GFX6: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) - ; GFX6: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[C]](s32) - ; GFX6: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[COPY7]], [[C]](s32) - ; GFX6: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[SHL4]], [[SHL5]] - ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SHL4]], [[UMIN2]] - ; GFX6: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[SUB2]], [[C]](s32) + ; GFX6: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX6: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[AND4]], [[AND5]] + ; GFX6: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[UMAX2]], [[AND5]] ; GFX6: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX6: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] ; GFX6: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) - ; GFX6: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[C]](s32) - ; GFX6: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[C]](s32) - ; GFX6: [[UMIN3:%[0-9]+]]:_(s32) = G_UMIN [[SHL6]], [[SHL7]] - ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SHL6]], [[UMIN3]] - ; GFX6: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[SUB3]], [[C]](s32) - ; GFX6: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) - ; GFX6: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] - ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) - ; GFX6: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] - ; GFX6: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) - ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL8]] + ; GFX6: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX6: [[UMAX3:%[0-9]+]]:_(s32) = G_UMAX [[AND6]], [[AND7]] + ; GFX6: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[UMAX3]], [[AND7]] + ; GFX6: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; GFX6: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX6: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32) + ; GFX6: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; GFX6: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL]] ; GFX6: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) - ; GFX6: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] - ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) - ; GFX6: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] - ; GFX6: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32) - ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL9]] + ; GFX6: [[COPY12:%[0-9]+]]:_(s32) = COPY [[SUB2]](s32) + ; GFX6: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX6: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SUB3]](s32) + ; GFX6: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C1]] + ; GFX6: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32) + ; GFX6: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL1]] ; GFX6: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX6: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) ; GFX6: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) @@ -622,8 +608,8 @@ ; GFX6-LABEL: name: usubsat_s32 ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], 
[[UMIN]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[COPY1]] ; GFX6: $vgpr0 = COPY [[SUB]](s32) ; GFX8-LABEL: name: usubsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -652,10 +638,10 @@ ; GFX6: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX6: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX6: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] + ; GFX6: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV2]] + ; GFX6: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UMAX]], [[UV2]] + ; GFX6: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]] + ; GFX6: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UMAX1]], [[UV3]] ; GFX6: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) ; GFX6: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX8-LABEL: name: usubsat_v2s32 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -8,16 +8,15 @@ ; GFX6-LABEL: v_saddsat_i7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 7 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0xffffffc0, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 63, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX6-NEXT: 
v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i7: @@ -61,20 +60,19 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 25 -; GFX6-NEXT: s_lshl_b32 s1, s1, 25 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x70000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x70000 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: s_sub_i32 s2, 63, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s3, s0, 0 -; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sub_i32 s3, 0xffffffc0, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, s1 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 25 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_i7: @@ -132,16 +130,15 @@ ; GFX6-LABEL: v_saddsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0xffffff80, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7f, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i8: @@ -185,20 +182,19 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; 
GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: s_sub_i32 s2, 0x7f, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s3, s0, 0 -; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 +; GFX6-NEXT: s_sub_i32 s3, 0xffffff80, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, s1 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_i8: @@ -257,30 +253,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_ashrrev_i32_e32 
v0, 24, v0 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 @@ -368,25 +362,24 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: s_cselect_b32 s6, s0, 0 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: s_cselect_b32 s7, s0, 0 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 ; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, s1 ; GFX6-NEXT: s_cselect_b32 s1, s7, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s6 ; GFX6-NEXT: s_cselect_b32 s1, s1, s6 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 +; GFX6-NEXT: s_sext_i32_i8 s1, s2 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i8 s2, s3 ; GFX6-NEXT: s_cselect_b32 s3, s1, 0 ; GFX6-NEXT: s_sub_i32 s3, s4, s3 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -398,7 +391,6 @@ ; GFX6-NEXT: s_cselect_b32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_movk_i32 s2, 0xff -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 @@ -523,60 +515,56 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; 
GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX6-NEXT: v_bfe_i32 v2, v5, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 8 +; GFX6-NEXT: v_bfe_i32 v3, v6, 0, 8 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_mov_b32_e32 v11, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x7f ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX6-NEXT: v_bfe_i32 v4, v7, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: 
v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -730,25 +718,24 @@ ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s8, -2 +; GFX6-NEXT: s_movk_i32 s8, 0x7f ; GFX6-NEXT: s_cselect_b32 s10, s0, 0 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_mov_b32 s9, 0xffffff80 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 ; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_cmp_gt_i32 s11, s1 ; GFX6-NEXT: s_cselect_b32 s1, s11, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s10 ; GFX6-NEXT: s_cselect_b32 s1, s1, s10 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s5, 24 +; GFX6-NEXT: s_sext_i32_i8 s1, s2 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i8 s2, s5 ; GFX6-NEXT: s_cselect_b32 s5, s1, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -759,13 +746,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s2, s5 ; GFX6-NEXT: s_cselect_b32 s2, s2, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_lshl_b32 s3, s6, 24 +; GFX6-NEXT: 
s_sext_i32_i8 s2, s3 ; GFX6-NEXT: s_cmp_gt_i32 s2, 0 ; GFX6-NEXT: s_cselect_b32 s5, s2, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s2, 0 +; GFX6-NEXT: s_sext_i32_i8 s3, s6 ; GFX6-NEXT: s_cselect_b32 s6, s2, 0 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, s3 @@ -773,14 +759,13 @@ ; GFX6-NEXT: s_cmp_lt_i32 s3, s5 ; GFX6-NEXT: s_cselect_b32 s3, s3, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s4, 24 -; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_lshl_b32 s4, s7, 24 +; GFX6-NEXT: s_sext_i32_i8 s3, s4 ; GFX6-NEXT: s_cmp_gt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s5, s3, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s6, s3, 0 +; GFX6-NEXT: s_sext_i32_i8 s4, s7 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, s4 ; GFX6-NEXT: s_cselect_b32 s4, s6, s4 @@ -794,7 +779,6 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s3, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 @@ -1009,16 +993,15 @@ ; GFX6-LABEL: v_saddsat_i24: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 24 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0xff800000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i24: @@ -1063,20 +1046,19 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg 
%lhs, i24 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x180000 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffff, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s3, s0, 0 -; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sub_i32 s3, 0xff800000, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, s1 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 8 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_i24: @@ -2883,16 +2865,15 @@ ; GFX6-LABEL: v_saddsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0xffff8000, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i16: @@ -2927,20 +2908,19 @@ define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_saddsat_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: s_sub_i32 s2, 
0x7fff, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s3, s0, 0 -; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 +; GFX6-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, s1 ; GFX6-NEXT: s_cselect_b32 s1, s3, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s2 ; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_i16: @@ -2984,18 +2964,17 @@ define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-LABEL: saddsat_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s1, s0, 0 -; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 +; GFX6-NEXT: s_sub_i32 s1, 0x7fff, s1 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i16_sv: @@ -3031,16 +3010,15 @@ define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-LABEL: saddsat_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0xffff8000, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fff, v1 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_i16_vs: @@ -3073,28 +3051,26 @@ ; GFX6-LABEL: v_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i16: @@ -3140,25 +3116,24 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: s_cselect_b32 s6, s0, 0 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: 
s_cselect_b32 s7, s0, 0 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, s2 ; GFX6-NEXT: s_cselect_b32 s2, s7, s2 ; GFX6-NEXT: s_cmp_lt_i32 s2, s6 ; GFX6-NEXT: s_cselect_b32 s2, s2, s6 ; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s3, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i16 s2, s3 ; GFX6-NEXT: s_cselect_b32 s3, s1, 0 ; GFX6-NEXT: s_sub_i32 s3, s4, s3 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -3170,7 +3145,6 @@ ; GFX6-NEXT: s_cselect_b32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3243,33 +3217,31 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: saddsat_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: s_movk_i32 s2, 0x7fff ; GFX6-NEXT: s_cselect_b32 s4, s0, 0 ; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_mov_b32 s3, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s5, s0, 0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: s_sub_i32 s5, s3, s5 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s1, s0, 0 ; GFX6-NEXT: s_sub_i32 s1, s2, s1 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s2, s0, 0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: s_sub_i32 s2, s3, s2 ; 
GFX6-NEXT: v_max_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3325,29 +3297,27 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: saddsat_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s3, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: s_movk_i32 s2, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 ; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3407,52 +3377,48 @@ ; GFX6-LABEL: v_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 
0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_bfe_i32 v4, v5, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_bfe_i32 v4, v6, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v11, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_bfe_i32 v4, v7, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 
v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3524,25 +3490,24 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s8, -2 +; GFX6-NEXT: s_movk_i32 s8, 0x7fff ; GFX6-NEXT: s_cselect_b32 s10, s0, 0 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_mov_b32 s9, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s11, s0, 0 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_cmp_gt_i32 s11, s4 ; GFX6-NEXT: s_cselect_b32 s4, s11, s4 ; GFX6-NEXT: s_cmp_lt_i32 s4, s10 ; GFX6-NEXT: s_cselect_b32 s4, s4, s10 ; GFX6-NEXT: s_add_i32 s0, s0, s4 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 ; GFX6-NEXT: s_cselect_b32 s5, s1, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -3553,13 +3518,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s4, s5 ; GFX6-NEXT: s_cselect_b32 s4, s4, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s4 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, 
0 ; GFX6-NEXT: s_cselect_b32 s5, s2, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s2, 0 +; GFX6-NEXT: s_sext_i32_i16 s4, s6 ; GFX6-NEXT: s_cselect_b32 s6, s2, 0 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, s4 @@ -3567,14 +3531,13 @@ ; GFX6-NEXT: s_cmp_lt_i32 s4, s5 ; GFX6-NEXT: s_cselect_b32 s4, s4, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s4 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s7, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s5, s3, 0 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_cmp_lt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s6, s3, 0 +; GFX6-NEXT: s_sext_i32_i16 s4, s7 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, s4 ; GFX6-NEXT: s_cselect_b32 s4, s6, s4 @@ -3583,7 +3546,6 @@ ; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3714,79 +3676,73 @@ ; GFX6-LABEL: v_saddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12 ; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX6-NEXT: v_bfe_i32 
v6, v7, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX6-NEXT: v_bfe_i32 v6, v8, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v15, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mov_b32_e32 v13, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 -; GFX6-NEXT: v_add_i32_e32 v4, 
vcc, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3875,25 +3831,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v6i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s12, -2 +; GFX6-NEXT: s_movk_i32 s12, 0x7fff ; GFX6-NEXT: s_cselect_b32 s14, s0, 0 ; GFX6-NEXT: s_sub_i32 s14, s12, s14 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s13, 0x80000000 +; GFX6-NEXT: s_mov_b32 s13, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s15, s0, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_sub_i32 s15, s13, s15 ; GFX6-NEXT: s_cmp_gt_i32 s15, s6 ; GFX6-NEXT: s_cselect_b32 s6, s15, s6 ; GFX6-NEXT: 
s_cmp_lt_i32 s6, s14 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_add_i32 s0, s0, s6 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 ; GFX6-NEXT: s_cselect_b32 s7, s1, 0 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -3904,13 +3859,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s6, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s7 ; GFX6-NEXT: s_add_i32 s1, s1, s6 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, 0 ; GFX6-NEXT: s_cselect_b32 s7, s2, 0 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_cmp_lt_i32 s2, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s8 ; GFX6-NEXT: s_cselect_b32 s8, s2, 0 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_cmp_gt_i32 s8, s6 @@ -3918,42 +3872,39 @@ ; GFX6-NEXT: s_cmp_lt_i32 s6, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s7 ; GFX6-NEXT: s_add_i32 s2, s2, s6 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s7, s3, 0 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_cmp_lt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s8, s3, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s9 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_cmp_gt_i32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, s8, s6 ; GFX6-NEXT: s_cmp_lt_i32 s6, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s7 ; GFX6-NEXT: s_add_i32 s3, s3, s6 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s6, s10, 16 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_cmp_gt_i32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s7, s4, 0 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_cmp_lt_i32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s8, s4, 0 +; GFX6-NEXT: s_sext_i32_i16 
s6, s10 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_cmp_gt_i32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, s8, s6 ; GFX6-NEXT: s_cmp_lt_i32 s6, s7 ; GFX6-NEXT: s_cselect_b32 s6, s6, s7 ; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s6, s11, 16 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: s_cmp_gt_i32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s7, s5, 0 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_cmp_lt_i32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s8, s5, 0 +; GFX6-NEXT: s_sext_i32_i16 s6, s11 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_cmp_gt_i32 s8, s6 ; GFX6-NEXT: s_cselect_b32 s6, s8, s6 @@ -3967,7 +3918,6 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s6 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_and_b32 s3, s5, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -4129,103 +4079,95 @@ ; GFX6-LABEL: v_saddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 ; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX6-NEXT: v_bfe_i32 v8, v9, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s5, v16 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9 ; GFX6-NEXT: 
v_max_i32_e32 v8, v16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX6-NEXT: v_bfe_i32 v8, v10, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v19, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX6-NEXT: v_bfe_i32 v8, v11, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GFX6-NEXT: v_bfe_i32 v8, v12, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v5 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, 
v8 +; GFX6-NEXT: v_bfe_i32 v8, v13, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_bfe_i32 v8, v14, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_bfe_i32 v8, v15, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: 
v_ashrrev_i32_e32 v7, 16, v7 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4332,25 +4274,24 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, 0 -; GFX6-NEXT: s_brev_b32 s16, -2 +; GFX6-NEXT: s_movk_i32 s16, 0x7fff ; GFX6-NEXT: s_cselect_b32 s18, s0, 0 ; GFX6-NEXT: s_sub_i32 s18, s16, s18 ; GFX6-NEXT: s_cmp_lt_i32 s0, 0 -; GFX6-NEXT: s_mov_b32 s17, 0x80000000 +; GFX6-NEXT: s_mov_b32 s17, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s19, s0, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s8 ; GFX6-NEXT: s_sub_i32 s19, s17, s19 ; GFX6-NEXT: s_cmp_gt_i32 s19, s8 ; GFX6-NEXT: s_cselect_b32 s8, s19, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s18 ; GFX6-NEXT: s_cselect_b32 s8, s8, s18 ; GFX6-NEXT: s_add_i32 s0, s0, s8 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s9 ; GFX6-NEXT: s_cselect_b32 s9, s1, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s1, 0 @@ -4361,13 +4302,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s1, s1, s8 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, 0 ; GFX6-NEXT: s_cselect_b32 s9, s2, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s2, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, 
s10 ; GFX6-NEXT: s_cselect_b32 s10, s2, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 @@ -4375,70 +4315,65 @@ ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s2, s2, s8 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s9, s3, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s3, 0 ; GFX6-NEXT: s_cselect_b32 s10, s3, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s11 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_cmp_gt_i32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s9, s4, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s4, 0 ; GFX6-NEXT: s_cselect_b32 s10, s4, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s12 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s8, s13, 16 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: s_cmp_gt_i32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s9, s5, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s10, s5, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s13 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s5, s5, s8 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; 
GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s8, s14, 16 +; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s9, s6, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s10, s6, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s14 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s9 ; GFX6-NEXT: s_cselect_b32 s8, s8, s9 ; GFX6-NEXT: s_add_i32 s6, s6, s8 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s8, s15, 16 +; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, 0 ; GFX6-NEXT: s_cselect_b32 s9, s7, 0 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_cmp_lt_i32 s7, 0 ; GFX6-NEXT: s_cselect_b32 s10, s7, 0 +; GFX6-NEXT: s_sext_i32_i16 s8, s15 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_cmp_gt_i32 s10, s8 ; GFX6-NEXT: s_cselect_b32 s8, s10, s8 @@ -4454,7 +4389,6 @@ ; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: s_and_b32 s3, s5, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s8 ; GFX6-NEXT: s_and_b32 s4, s7, s8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -8,16 +8,15 @@ ; GFX6-LABEL: v_ssubsat_i7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 7 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 7 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 63, v2 +; 
GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i7: @@ -62,20 +61,19 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 25 -; GFX6-NEXT: s_lshl_b32 s1, s1, 25 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x70000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x70000 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_sub_i32 s2, s2, 63 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_sub_i32 s3, s3, 0xffffffc0 ; GFX6-NEXT: s_cmp_gt_i32 s2, s1 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s3 ; GFX6-NEXT: s_cselect_b32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 25 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_i7: @@ -133,16 +131,15 @@ ; GFX6-LABEL: v_ssubsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7f, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0xffffff80, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i8: @@ -187,20 +184,19 @@ define amdgpu_ps i8 
@s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7f ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 +; GFX6-NEXT: s_sub_i32 s3, s3, 0xffffff80 ; GFX6-NEXT: s_cmp_gt_i32 s2, s1 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s3 ; GFX6-NEXT: s_cselect_b32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_i8: @@ -259,20 +255,20 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 @@ -280,9 +276,7 @@ ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 
v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 @@ -371,25 +365,24 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s1 ; GFX6-NEXT: s_cselect_b32 s1, s6, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s7 ; GFX6-NEXT: s_cselect_b32 s1, s1, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 +; GFX6-NEXT: s_sext_i32_i8 s1, s2 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i8 s2, s3 ; GFX6-NEXT: s_cselect_b32 s3, s1, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -401,7 +394,6 @@ ; GFX6-NEXT: s_cselect_b32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_movk_i32 s2, 0xff -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 @@ -526,60 +518,56 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: s_movk_i32 s4, 0x7f ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX6-NEXT: v_bfe_i32 v2, v5, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 8 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX6-NEXT: v_bfe_i32 v3, v6, 0, 8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_mov_b32_e32 v9, 0x7f ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_mov_b32_e32 v11, 0xffffff80 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX6-NEXT: 
v_bfe_i32 v4, v7, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -735,25 +723,24 @@ ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 s8, -2 +; GFX6-NEXT: s_movk_i32 s8, 0x7f ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_mov_b32 s9, 0xffffff80 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 +; GFX6-NEXT: s_sext_i32_i8 s1, s1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s1 ; GFX6-NEXT: s_cselect_b32 s1, s10, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s11 ; GFX6-NEXT: s_cselect_b32 s1, s1, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s5, 24 +; GFX6-NEXT: s_sext_i32_i8 s1, s2 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i8 s2, s5 ; GFX6-NEXT: s_cselect_b32 s5, s1, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -764,13 +751,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s2, s10 ; GFX6-NEXT: 
s_cselect_b32 s2, s2, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_lshl_b32 s3, s6, 24 +; GFX6-NEXT: s_sext_i32_i8 s2, s3 ; GFX6-NEXT: s_cmp_gt_i32 s2, -1 ; GFX6-NEXT: s_cselect_b32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s2, -1 +; GFX6-NEXT: s_sext_i32_i8 s3, s6 ; GFX6-NEXT: s_cselect_b32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_cmp_gt_i32 s5, s3 @@ -778,14 +764,13 @@ ; GFX6-NEXT: s_cmp_lt_i32 s3, s6 ; GFX6-NEXT: s_cselect_b32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s4, 24 -; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_lshl_b32 s4, s7, 24 +; GFX6-NEXT: s_sext_i32_i8 s3, s4 ; GFX6-NEXT: s_cmp_gt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s6, s3, -1 +; GFX6-NEXT: s_sext_i32_i8 s4, s7 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_cmp_gt_i32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 @@ -799,7 +784,6 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s3, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 @@ -1014,16 +998,15 @@ ; GFX6-LABEL: v_ssubsat_i24: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 24 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffff, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0xff800000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i24: @@ -1068,20 +1051,19 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x180000 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffff ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_sub_i32 s3, s3, 0xff800000 ; GFX6-NEXT: s_cmp_gt_i32 s2, s1 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s3 ; GFX6-NEXT: s_cselect_b32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 8 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_i24: @@ -2873,16 +2855,15 @@ ; GFX6-LABEL: v_ssubsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fff, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0xffff8000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i16: @@ -2918,20 +2899,19 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, 
s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fff ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 +; GFX6-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX6-NEXT: s_cmp_gt_i32 s2, s1 ; GFX6-NEXT: s_cselect_b32 s1, s2, s1 ; GFX6-NEXT: s_cmp_lt_i32 s1, s3 ; GFX6-NEXT: s_cselect_b32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_i16: @@ -2975,18 +2955,17 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-LABEL: ssubsat_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s1, s0, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fff ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i16_sv: @@ -3022,16 +3001,15 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-LABEL: ssubsat_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; 
GFX6-NEXT: s_sext_i32_i16 s0, s0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fff, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0xffff8000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_i16_vs: @@ -3065,19 +3043,19 @@ ; GFX6-LABEL: v_ssubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 @@ -3085,8 +3063,6 @@ ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i16: @@ -3133,25 +3109,24 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 
s4, -2 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: s_cselect_b32 s6, s0, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s7, s0, -1 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_cmp_gt_i32 s6, s2 ; GFX6-NEXT: s_cselect_b32 s2, s6, s2 ; GFX6-NEXT: s_cmp_lt_i32 s2, s7 ; GFX6-NEXT: s_cselect_b32 s2, s2, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s3, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i16 s2, s3 ; GFX6-NEXT: s_cselect_b32 s3, s1, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -3163,7 +3138,6 @@ ; GFX6-NEXT: s_cselect_b32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3236,33 +3210,31 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: ssubsat_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: s_movk_i32 s2, 0x7fff ; GFX6-NEXT: s_cselect_b32 s4, s0, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_mov_b32 s3, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s5, s0, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s3 ; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s5, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 ; GFX6-NEXT: 
s_cselect_b32 s1, s0, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 ; GFX6-NEXT: s_cselect_b32 s2, s0, -1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3318,29 +3290,27 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: ssubsat_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_movk_i32 s2, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: s_mov_b32 s3, 0x80000000 +; GFX6-NEXT: s_mov_b32 s3, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: 
v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3401,52 +3371,48 @@ ; GFX6-LABEL: v_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX6-NEXT: v_bfe_i32 v4, v5, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX6-NEXT: v_bfe_i32 v4, v6, 0, 16 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: 
v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v11, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_bfe_i32 v4, v7, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3519,25 +3485,24 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 s8, -2 +; GFX6-NEXT: s_movk_i32 s8, 0x7fff ; GFX6-NEXT: s_cselect_b32 s10, s0, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s9, 0x80000000 +; GFX6-NEXT: s_mov_b32 s9, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s11, s0, -1 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_cmp_gt_i32 s10, s4 ; GFX6-NEXT: s_cselect_b32 s4, s10, s4 ; GFX6-NEXT: s_cmp_lt_i32 s4, s11 ; GFX6-NEXT: s_cselect_b32 s4, s4, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 ; GFX6-NEXT: s_cselect_b32 s5, s1, -1 ; GFX6-NEXT: 
s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -3548,13 +3513,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s4, s10 ; GFX6-NEXT: s_cselect_b32 s4, s4, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, -1 ; GFX6-NEXT: s_cselect_b32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s2, -1 +; GFX6-NEXT: s_sext_i32_i16 s4, s6 ; GFX6-NEXT: s_cselect_b32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_cmp_gt_i32 s5, s4 @@ -3562,14 +3526,13 @@ ; GFX6-NEXT: s_cmp_lt_i32 s4, s6 ; GFX6-NEXT: s_cselect_b32 s4, s4, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s7, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_cmp_lt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s6, s3, -1 +; GFX6-NEXT: s_sext_i32_i16 s4, s7 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_cmp_gt_i32 s5, s4 ; GFX6-NEXT: s_cselect_b32 s4, s5, s4 @@ -3578,7 +3541,6 @@ ; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -3709,79 +3671,73 @@ ; GFX6-LABEL: v_ssubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: 
v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX6-NEXT: v_bfe_i32 v6, v8, 0, 16 +; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s4, v7 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 +; GFX6-NEXT: v_mov_b32_e32 v13, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_mov_b32_e32 v15, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v15, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_min_i32_e32 v8, -1, 
v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_bfe_i32 v6, v10, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_bfe_i32 v6, v11, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3872,25 +3828,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v6i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: 
s_brev_b32 s12, -2 +; GFX6-NEXT: s_movk_i32 s12, 0x7fff ; GFX6-NEXT: s_cselect_b32 s14, s0, -1 ; GFX6-NEXT: s_sub_i32 s14, s14, s12 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s13, 0x80000000 +; GFX6-NEXT: s_mov_b32 s13, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s15, s0, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_sub_i32 s15, s15, s13 ; GFX6-NEXT: s_cmp_gt_i32 s14, s6 ; GFX6-NEXT: s_cselect_b32 s6, s14, s6 ; GFX6-NEXT: s_cmp_lt_i32 s6, s15 ; GFX6-NEXT: s_cselect_b32 s6, s6, s15 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 ; GFX6-NEXT: s_cselect_b32 s7, s1, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -3901,13 +3856,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s6, s14 ; GFX6-NEXT: s_cselect_b32 s6, s6, s14 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, -1 ; GFX6-NEXT: s_cselect_b32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 ; GFX6-NEXT: s_cmp_lt_i32 s2, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s8 ; GFX6-NEXT: s_cselect_b32 s8, s2, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_cmp_gt_i32 s7, s6 @@ -3915,42 +3869,39 @@ ; GFX6-NEXT: s_cmp_lt_i32 s6, s8 ; GFX6-NEXT: s_cselect_b32 s6, s6, s8 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 ; GFX6-NEXT: s_cmp_lt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s8, s3, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s9 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_cmp_gt_i32 s7, s6 ; GFX6-NEXT: s_cselect_b32 s6, 
s7, s6 ; GFX6-NEXT: s_cmp_lt_i32 s6, s8 ; GFX6-NEXT: s_cselect_b32 s6, s6, s8 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s6, s10, 16 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_cmp_gt_i32 s4, -1 ; GFX6-NEXT: s_cselect_b32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 ; GFX6-NEXT: s_cmp_lt_i32 s4, -1 ; GFX6-NEXT: s_cselect_b32 s8, s4, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s10 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_cmp_gt_i32 s7, s6 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6 ; GFX6-NEXT: s_cmp_lt_i32 s6, s8 ; GFX6-NEXT: s_cselect_b32 s6, s6, s8 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s6, s11, 16 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: s_cmp_gt_i32 s5, -1 ; GFX6-NEXT: s_cselect_b32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 ; GFX6-NEXT: s_cmp_lt_i32 s5, -1 ; GFX6-NEXT: s_cselect_b32 s8, s5, -1 +; GFX6-NEXT: s_sext_i32_i16 s6, s11 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_cmp_gt_i32 s7, s6 ; GFX6-NEXT: s_cselect_b32 s6, s7, s6 @@ -3964,7 +3915,6 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s6 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_and_b32 s3, s5, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -4126,103 +4076,95 @@ ; GFX6-LABEL: v_ssubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s4, -2 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 -; GFX6-NEXT: s_mov_b32 s5, 0x80000000 +; GFX6-NEXT: s_mov_b32 s5, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 
v18, vcc, s5, v18 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX6-NEXT: v_bfe_i32 v8, v9, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9 ; GFX6-NEXT: v_min_i32_e32 v16, -1, v1 +; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s5, v16 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; GFX6-NEXT: v_bfe_i32 v8, v10, 0, 16 +; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s4, v9 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x7fff ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_mov_b32_e32 v19, 0x80000000 +; GFX6-NEXT: v_mov_b32_e32 v19, 0xffff8000 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX6-NEXT: v_bfe_i32 v8, v11, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, 
v12 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; GFX6-NEXT: v_bfe_i32 v8, v12, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_bfe_i32 v8, v13, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 +; GFX6-NEXT: v_bfe_i32 v8, v14, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_bfe_i32 v8, v15, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; 
GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -4331,25 +4273,24 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 +; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_cmp_gt_i32 s0, -1 -; GFX6-NEXT: s_brev_b32 s16, -2 +; GFX6-NEXT: s_movk_i32 s16, 0x7fff ; GFX6-NEXT: s_cselect_b32 s18, s0, -1 ; GFX6-NEXT: s_sub_i32 s18, s18, s16 ; GFX6-NEXT: s_cmp_lt_i32 s0, -1 -; GFX6-NEXT: s_mov_b32 s17, 0x80000000 +; GFX6-NEXT: s_mov_b32 s17, 0xffff8000 ; GFX6-NEXT: s_cselect_b32 s19, s0, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s8 ; GFX6-NEXT: s_sub_i32 s19, s19, s17 ; GFX6-NEXT: s_cmp_gt_i32 s18, s8 ; GFX6-NEXT: s_cselect_b32 s8, s18, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s19 ; GFX6-NEXT: s_cselect_b32 s8, s8, s19 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 -; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 +; GFX6-NEXT: s_sext_i32_i16 s1, s1 ; GFX6-NEXT: s_cmp_gt_i32 s1, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s9 ; 
GFX6-NEXT: s_cselect_b32 s9, s1, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s1, -1 @@ -4360,13 +4301,12 @@ ; GFX6-NEXT: s_cmp_lt_i32 s8, s18 ; GFX6-NEXT: s_cselect_b32 s8, s8, s18 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 +; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_cmp_gt_i32 s2, -1 ; GFX6-NEXT: s_cselect_b32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s2, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s10 ; GFX6-NEXT: s_cselect_b32 s10, s2, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 @@ -4374,70 +4314,65 @@ ; GFX6-NEXT: s_cmp_lt_i32 s8, s10 ; GFX6-NEXT: s_cselect_b32 s8, s8, s10 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 -; GFX6-NEXT: s_ashr_i32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 +; GFX6-NEXT: s_sext_i32_i16 s3, s3 ; GFX6-NEXT: s_cmp_gt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s3, -1 ; GFX6-NEXT: s_cselect_b32 s10, s3, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s11 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s10 ; GFX6-NEXT: s_cselect_b32 s8, s8, s10 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_cmp_gt_i32 s4, -1 ; GFX6-NEXT: s_cselect_b32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s4, -1 ; GFX6-NEXT: s_cselect_b32 s10, s4, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s12 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s10 ; GFX6-NEXT: s_cselect_b32 s8, s8, s10 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: 
s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s8, s13, 16 +; GFX6-NEXT: s_sext_i32_i16 s5, s5 ; GFX6-NEXT: s_cmp_gt_i32 s5, -1 ; GFX6-NEXT: s_cselect_b32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s5, -1 ; GFX6-NEXT: s_cselect_b32 s10, s5, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s13 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s10 ; GFX6-NEXT: s_cselect_b32 s8, s8, s10 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s8, s14, 16 +; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_cmp_gt_i32 s6, -1 ; GFX6-NEXT: s_cselect_b32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s6, -1 ; GFX6-NEXT: s_cselect_b32 s10, s6, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s14 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 ; GFX6-NEXT: s_cmp_lt_i32 s8, s10 ; GFX6-NEXT: s_cselect_b32 s8, s8, s10 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s8, s15, 16 +; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_cmp_gt_i32 s7, -1 ; GFX6-NEXT: s_cselect_b32 s9, s7, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_cmp_lt_i32 s7, -1 ; GFX6-NEXT: s_cselect_b32 s10, s7, -1 +; GFX6-NEXT: s_sext_i32_i16 s8, s15 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_cmp_gt_i32 s9, s8 ; GFX6-NEXT: s_cselect_b32 s8, s9, s8 @@ -4453,7 +4388,6 @@ ; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: s_and_b32 s3, s5, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s8 ; GFX6-NEXT: s_and_b32 s4, s7, s8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -8,12 +8,11 @@ ; GFX6-LABEL: v_uaddsat_i7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX6-NEXT: s_movk_i32 s4, 0x7f +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i7: @@ -51,13 +50,12 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 25 -; GFX6-NEXT: s_lshl_b32 s1, s1, 25 -; GFX6-NEXT: s_not_b32 s2, s0 -; GFX6-NEXT: s_cmp_lt_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 25 +; GFX6-NEXT: s_cmp_lt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i7: @@ -100,12 +98,11 @@ ; GFX6-LABEL: v_uaddsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i8: @@ -143,13 +140,12 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: 
s_uaddsat_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_not_b32 s2, s0 -; GFX6-NEXT: s_cmp_lt_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_movk_i32 s2, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 +; GFX6-NEXT: s_cmp_lt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i8: @@ -192,23 +188,19 @@ ; GFX6-LABEL: v_uaddsat_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -278,25 +270,21 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-LABEL: s_uaddsat_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s4, 0xff ; 
GFX6-NEXT: s_lshr_b32 s2, s0, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_not_b32 s4, s0 -; GFX6-NEXT: s_cmp_lt_u32 s4, s1 -; GFX6-NEXT: s_cselect_b32 s1, s4, s1 +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s4 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_not_b32 s3, s1 -; GFX6-NEXT: s_cmp_lt_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_cmp_lt_u32 s0, s4 +; GFX6-NEXT: s_cselect_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s2, s3, s4 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_cmp_lt_u32 s1, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -381,45 +369,37 @@ ; GFX6-LABEL: v_uaddsat_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v8, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v2, v5, v2 +; GFX6-NEXT: 
v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX6-NEXT: v_min_u32_e32 v3, v5, v3 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -538,49 +518,41 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-LABEL: s_uaddsat_v4i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s8, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_not_b32 s8, s0 -; 
GFX6-NEXT: s_cmp_lt_u32 s8, s1 -; GFX6-NEXT: s_cselect_b32 s1, s8, s1 +; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_lshl_b32 s2, s5, 24 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_not_b32 s5, s1 -; GFX6-NEXT: s_cmp_lt_u32 s5, s2 -; GFX6-NEXT: s_cselect_b32 s2, s5, s2 +; GFX6-NEXT: s_cmp_lt_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s5, s8 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_not_b32 s5, s2 -; GFX6-NEXT: s_cmp_lt_u32 s5, s3 -; GFX6-NEXT: s_cselect_b32 s3, s5, s3 +; GFX6-NEXT: s_cmp_lt_u32 s1, s8 +; GFX6-NEXT: s_cselect_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s6, s8 ; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s4, 24 -; GFX6-NEXT: s_lshr_b32 s2, s2, 24 -; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_not_b32 s5, s3 -; GFX6-NEXT: s_cmp_lt_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cmp_lt_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s3, s4, s8 +; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_cmp_lt_u32 s3, s8 +; GFX6-NEXT: s_cselect_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader 
part epilog @@ -725,12 +697,11 @@ ; GFX6-LABEL: v_uaddsat_i24: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i24: @@ -768,13 +739,12 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_not_b32 s2, s0 -; GFX6-NEXT: s_cmp_lt_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_mov_b32 s2, 0xffffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NEXT: s_cmp_lt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i24: @@ -1710,12 +1680,11 @@ ; GFX6-LABEL: v_uaddsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i16: @@ -1744,13 +1713,12 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 
16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_not_b32 s2, s0 -; GFX6-NEXT: s_cmp_lt_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: s_cmp_lt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i16: @@ -1780,12 +1748,11 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-LABEL: uaddsat_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_not_b32 s1, s0 -; GFX6-NEXT: v_min_u32_e32 v0, s1, v0 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i16_sv: @@ -1811,12 +1778,11 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-LABEL: uaddsat_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_min_u32_e32 v0, s1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i16_vs: @@ -1843,18 +1809,15 @@ ; GFX6-LABEL: v_uaddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v2i16: @@ -1887,23 +1850,19 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_not_b32 s4, s0 -; GFX6-NEXT: s_cmp_lt_u32 s4, s2 -; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s4 ; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s3, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_not_b32 s3, s1 -; GFX6-NEXT: s_cmp_lt_u32 s3, s2 -; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_cmp_lt_u32 s0, s4 +; GFX6-NEXT: s_cselect_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s2, s3, s4 ; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_cmp_lt_u32 s1, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1943,21 +1902,17 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x 
i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: uaddsat_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_not_b32 s2, s0 -; GFX6-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_not_b32 s1, s0 -; GFX6-NEXT: v_min_u32_e32 v1, s1, v1 +; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_min_u32_e32 v1, s2, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1991,21 +1946,17 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: uaddsat_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; 
GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GFX6-NEXT: v_min_u32_e32 v1, s2, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2051,37 +2002,30 @@ ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2123,41 +2067,33 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_not_b32 s8, s0 -; GFX6-NEXT: s_cmp_lt_u32 s8, s4 -; GFX6-NEXT: s_cselect_b32 s4, s8, s4 +; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s4, s4, s8 ; GFX6-NEXT: s_add_i32 s0, s0, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_not_b32 s5, s1 -; GFX6-NEXT: s_cmp_lt_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cmp_lt_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s4, s5, s8 ; GFX6-NEXT: s_add_i32 s1, s1, s4 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_not_b32 s5, s2 -; GFX6-NEXT: s_cmp_lt_u32 s5, s4 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cmp_lt_u32 s1, s8 +; GFX6-NEXT: s_cselect_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s4, s6, s8 ; GFX6-NEXT: s_add_i32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_not_b32 s5, s3 -; GFX6-NEXT: s_cmp_lt_u32 s5, 
s4 -; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cmp_lt_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_cmp_lt_u32 s3, s8 +; GFX6-NEXT: s_cselect_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -2223,53 +2159,42 @@ ; GFX6-LABEL: v_uaddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_xor_b32_e32 v12, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v6, v12, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v7 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v3 -; GFX6-NEXT: v_min_u32_e32 v6, 
v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v9 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v4 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v10 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v12 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v6, v11, v12 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v12 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v12 +; GFX6-NEXT: v_min_u32_e32 v5, v5, v12 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_min_u32_e32 v4, s4, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: 
s_setpc_b64 s[30:31] @@ -2318,59 +2243,47 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v6i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_not_b32 s12, s0 -; GFX6-NEXT: s_cmp_lt_u32 s12, s6 -; GFX6-NEXT: s_cselect_b32 s6, s12, s6 +; GFX6-NEXT: s_mov_b32 s12, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s12 +; GFX6-NEXT: s_and_b32 s6, s6, s12 ; GFX6-NEXT: s_add_i32 s0, s0, s6 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_not_b32 s7, s1 -; GFX6-NEXT: s_cmp_lt_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cmp_lt_u32 s0, s12 +; GFX6-NEXT: s_cselect_b32 s0, s0, s12 +; GFX6-NEXT: s_and_b32 s1, s1, s12 +; GFX6-NEXT: s_and_b32 s6, s7, s12 ; GFX6-NEXT: s_add_i32 s1, s1, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_not_b32 s7, s2 -; GFX6-NEXT: s_cmp_lt_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cmp_lt_u32 s1, s12 +; GFX6-NEXT: s_cselect_b32 s1, s1, s12 +; GFX6-NEXT: s_and_b32 s2, s2, s12 +; GFX6-NEXT: s_and_b32 s6, s8, s12 ; GFX6-NEXT: s_add_i32 s2, s2, s6 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_not_b32 s7, s3 -; GFX6-NEXT: s_cmp_lt_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cmp_lt_u32 s2, s12 +; GFX6-NEXT: s_cselect_b32 s2, s2, s12 +; GFX6-NEXT: s_and_b32 s3, s3, s12 +; GFX6-NEXT: s_and_b32 s6, s9, s12 ; GFX6-NEXT: s_add_i32 s3, s3, s6 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_not_b32 s7, s4 -; GFX6-NEXT: s_cmp_lt_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cmp_lt_u32 s3, s12 +; GFX6-NEXT: s_cselect_b32 s3, s3, s12 +; GFX6-NEXT: s_and_b32 s4, 
s4, s12 +; GFX6-NEXT: s_and_b32 s6, s10, s12 ; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_not_b32 s7, s5 -; GFX6-NEXT: s_cmp_lt_u32 s7, s6 -; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cmp_lt_u32 s4, s12 +; GFX6-NEXT: s_cselect_b32 s4, s4, s12 +; GFX6-NEXT: s_and_b32 s5, s5, s12 +; GFX6-NEXT: s_and_b32 s6, s11, s12 ; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_cmp_lt_u32 s5, s12 +; GFX6-NEXT: s_cselect_b32 s5, s5, s12 +; GFX6-NEXT: s_and_b32 s1, s1, s12 +; GFX6-NEXT: s_and_b32 s0, s0, s12 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s1, s2, s12 +; GFX6-NEXT: s_and_b32 s2, s3, s12 +; GFX6-NEXT: s_and_b32 s3, s5, s12 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, s12 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog @@ -2439,69 +2352,54 @@ ; GFX6-LABEL: v_uaddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v0 -; GFX6-NEXT: v_min_u32_e32 v8, v16, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v1 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; 
GFX6-NEXT: v_and_b32_e32 v8, s4, v9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v2 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v3 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v4 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v5 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v8, v13, v16 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v6 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 +; GFX6-NEXT: v_and_b32_e32 v6, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v8, v14, v16 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 -; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: v_min_u32_e32 v5, v5, v16 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_min_u32_e32 v4, s4, v4 +; GFX6-NEXT: v_min_u32_e32 v7, v7, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_min_u32_e32 v6, v6, v16 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2556,77 +2454,61 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_uaddsat_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_not_b32 s16, s0 -; GFX6-NEXT: s_cmp_lt_u32 s16, s8 -; GFX6-NEXT: s_cselect_b32 s8, s16, s8 +; GFX6-NEXT: s_mov_b32 s16, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s8, s8, s16 ; GFX6-NEXT: s_add_i32 s0, s0, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 
-; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_not_b32 s9, s1 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s0, s16 +; GFX6-NEXT: s_cselect_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s8, s9, s16 ; GFX6-NEXT: s_add_i32 s1, s1, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_not_b32 s9, s2 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s1, s16 +; GFX6-NEXT: s_cselect_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s8, s10, s16 ; GFX6-NEXT: s_add_i32 s2, s2, s8 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_not_b32 s9, s3 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s2, s16 +; GFX6-NEXT: s_cselect_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s8, s11, s16 ; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_not_b32 s9, s4 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s3, s16 +; GFX6-NEXT: s_cselect_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s8, s12, s16 ; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_not_b32 s9, s5 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s4, s16 +; GFX6-NEXT: s_cselect_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s5, s5, s16 +; GFX6-NEXT: s_and_b32 s8, s13, s16 ; GFX6-NEXT: s_add_i32 s5, s5, s8 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_lshr_b32 
s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_not_b32 s9, s6 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s5, s16 +; GFX6-NEXT: s_cselect_b32 s5, s5, s16 +; GFX6-NEXT: s_and_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s8, s14, s16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_not_b32 s9, s7 -; GFX6-NEXT: s_cmp_lt_u32 s9, s8 -; GFX6-NEXT: s_cselect_b32 s8, s9, s8 +; GFX6-NEXT: s_cmp_lt_u32 s6, s16 +; GFX6-NEXT: s_cselect_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s8, s15, s16 ; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_cmp_lt_u32 s7, s16 +; GFX6-NEXT: s_cselect_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s0, s0, s16 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s1, s2, s16 +; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s3, s5, s16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshr_b32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, s16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -8,11 +8,11 
@@ ; GFX6-LABEL: v_usubsat_i7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: s_movk_i32 s4, 0x7f +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_i7: @@ -50,12 +50,12 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 25 -; GFX6-NEXT: s_lshl_b32 s1, s1, 25 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_movk_i32 s2, 0x7f +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 25 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i7: @@ -98,11 +98,11 @@ ; GFX6-LABEL: v_usubsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_i8: @@ -140,12 +140,12 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_movk_i32 s2, 0xff +; GFX6-NEXT: 
s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i8: @@ -188,21 +188,19 @@ ; GFX6-LABEL: v_usubsat_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -272,23 +270,21 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-LABEL: s_usubsat_v2i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: 
s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_cmp_lt_u32 s1, s2 -; GFX6-NEXT: s_cselect_b32 s2, s1, s2 +; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_cmp_gt_u32 s1, s2 +; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_movk_i32 s2, 0xff -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -373,41 +369,37 @@ ; GFX6-LABEL: v_usubsat_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 -; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_min_u32_e32 v3, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, 
v0 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -526,45 +518,41 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-LABEL: s_usubsat_v4i8: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s8, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s2, s5, 24 -; GFX6-NEXT: s_cmp_lt_u32 s1, s2 -; GFX6-NEXT: s_cselect_b32 s2, s1, s2 +; GFX6-NEXT: s_and_b32 s1, s2, s8 +; GFX6-NEXT: s_and_b32 s2, 
s5, s8 +; GFX6-NEXT: s_cmp_gt_u32 s1, s2 +; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 24 -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 -; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_cmp_lt_u32 s2, s3 -; GFX6-NEXT: s_cselect_b32 s3, s2, s3 +; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_cmp_gt_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s2, s3 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s4, 24 -; GFX6-NEXT: s_lshr_b32 s2, s2, 24 -; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_cmp_lt_u32 s3, s4 -; GFX6-NEXT: s_cselect_b32 s4, s3, s4 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s3, s4, s8 +; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_cmp_gt_u32 s3, s4 +; GFX6-NEXT: s_cselect_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -709,11 +697,11 @@ ; GFX6-LABEL: v_usubsat_i24: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_usubsat_i24: @@ -751,12 +739,12 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_mov_b32 s2, 0xffffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 8 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i24: @@ -796,7 +784,7 @@ ; GFX6-LABEL: v_usubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -826,8 +814,8 @@ define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -858,8 +846,8 @@ define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; GFX6-LABEL: usubsat_i32_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_max_u32_e32 v1, s0, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i32_sv: @@ -885,8 +873,8 @@ define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX6-LABEL: usubsat_i32_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: 
usubsat_i32_vs: @@ -913,10 +901,10 @@ ; GFX6-LABEL: v_usubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i32: @@ -948,12 +936,12 @@ define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s2, s0, s2 +; GFX6-NEXT: s_cmp_gt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 -; GFX6-NEXT: s_cmp_lt_u32 s1, s3 -; GFX6-NEXT: s_cselect_b32 s2, s1, s3 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_cmp_gt_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s1, s1, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v2i32: @@ -992,12 +980,12 @@ ; GFX6-LABEL: v_usubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v3, v0, v3 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v3i32: @@ -1032,15 +1020,15 @@ define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s3 -; GFX6-NEXT: s_cselect_b32 s3, 
s0, s3 +; GFX6-NEXT: s_cmp_gt_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s0, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 -; GFX6-NEXT: s_cmp_lt_u32 s1, s4 -; GFX6-NEXT: s_cselect_b32 s3, s1, s4 -; GFX6-NEXT: s_sub_i32 s1, s1, s3 -; GFX6-NEXT: s_cmp_lt_u32 s2, s5 -; GFX6-NEXT: s_cselect_b32 s3, s2, s5 -; GFX6-NEXT: s_sub_i32 s2, s2, s3 +; GFX6-NEXT: s_cmp_gt_u32 s1, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_cmp_gt_u32 s2, s5 +; GFX6-NEXT: s_cselect_b32 s2, s2, s5 +; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v3i32: @@ -1087,14 +1075,14 @@ ; GFX6-LABEL: v_usubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v5 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v6 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i32: @@ -1132,18 +1120,18 @@ define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s4 -; GFX6-NEXT: s_cselect_b32 s4, s0, s4 +; GFX6-NEXT: s_cmp_gt_u32 s0, s4 +; GFX6-NEXT: s_cselect_b32 s0, s0, s4 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 -; GFX6-NEXT: s_cmp_lt_u32 s1, s5 -; GFX6-NEXT: s_cselect_b32 s4, s1, s5 -; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_cmp_lt_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s4, s2, s6 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; 
GFX6-NEXT: s_cmp_lt_u32 s3, s7 -; GFX6-NEXT: s_cselect_b32 s4, s3, s7 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_cmp_gt_u32 s1, s5 +; GFX6-NEXT: s_cselect_b32 s1, s1, s5 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: s_cmp_gt_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s6 +; GFX6-NEXT: s_cmp_gt_u32 s3, s7 +; GFX6-NEXT: s_cselect_b32 s3, s3, s7 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v4i32: @@ -1198,16 +1186,16 @@ ; GFX6-LABEL: v_usubsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v5, v0, v5 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v5 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v7 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v8 +; GFX6-NEXT: v_max_u32_e32 v4, v4, v9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GFX6-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v5i32: @@ -1248,21 +1236,21 @@ define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s5 -; GFX6-NEXT: s_cselect_b32 s5, s0, s5 +; GFX6-NEXT: s_cmp_gt_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s0, s0, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 -; GFX6-NEXT: s_cmp_lt_u32 s1, s6 -; GFX6-NEXT: s_cselect_b32 s5, s1, s6 -; GFX6-NEXT: s_sub_i32 s1, s1, s5 -; GFX6-NEXT: s_cmp_lt_u32 s2, s7 -; GFX6-NEXT: s_cselect_b32 s5, 
s2, s7 -; GFX6-NEXT: s_sub_i32 s2, s2, s5 -; GFX6-NEXT: s_cmp_lt_u32 s3, s8 -; GFX6-NEXT: s_cselect_b32 s5, s3, s8 -; GFX6-NEXT: s_sub_i32 s3, s3, s5 -; GFX6-NEXT: s_cmp_lt_u32 s4, s9 -; GFX6-NEXT: s_cselect_b32 s5, s4, s9 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_cmp_gt_u32 s1, s6 +; GFX6-NEXT: s_cselect_b32 s1, s1, s6 +; GFX6-NEXT: s_sub_i32 s1, s1, s6 +; GFX6-NEXT: s_cmp_gt_u32 s2, s7 +; GFX6-NEXT: s_cselect_b32 s2, s2, s7 +; GFX6-NEXT: s_sub_i32 s2, s2, s7 +; GFX6-NEXT: s_cmp_gt_u32 s3, s8 +; GFX6-NEXT: s_cselect_b32 s3, s3, s8 +; GFX6-NEXT: s_sub_i32 s3, s3, s8 +; GFX6-NEXT: s_cmp_gt_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s4, s4, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s9 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v5i32: @@ -1325,38 +1313,38 @@ ; GFX6-LABEL: v_usubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_min_u32_e32 v16, v0, v16 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v16 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v17 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v18 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v19 +; GFX6-NEXT: v_max_u32_e32 v4, v4, v20 +; GFX6-NEXT: v_max_u32_e32 v5, v5, v21 +; GFX6-NEXT: v_max_u32_e32 v6, v6, v22 +; GFX6-NEXT: v_max_u32_e32 v7, v7, v23 +; GFX6-NEXT: v_max_u32_e32 v8, v8, v24 +; GFX6-NEXT: v_max_u32_e32 v9, v9, v25 +; GFX6-NEXT: v_max_u32_e32 v10, v10, v26 +; GFX6-NEXT: v_max_u32_e32 v11, v11, v27 +; GFX6-NEXT: v_max_u32_e32 v12, v12, v28 +; GFX6-NEXT: v_max_u32_e32 v13, v13, v29 +; GFX6-NEXT: v_max_u32_e32 v14, v14, v30 +; GFX6-NEXT: v_max_u32_e32 v15, v15, v31 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v16 -; GFX6-NEXT: 
v_min_u32_e32 v16, v5, v21 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v16 -; GFX6-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v19 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v20 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v21 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v22 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v23 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v24 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v25 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v26 +; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v27 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v28 +; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v29 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v30 +; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v31 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v16i32: @@ -1430,54 +1418,54 @@ define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_cmp_lt_u32 s0, s16 -; GFX6-NEXT: s_cselect_b32 s16, s0, s16 -; GFX6-NEXT: s_sub_i32 s0, s0, s16 -; GFX6-NEXT: 
s_cmp_lt_u32 s1, s17 -; GFX6-NEXT: s_cselect_b32 s16, s1, s17 -; GFX6-NEXT: s_sub_i32 s1, s1, s16 -; GFX6-NEXT: s_cmp_lt_u32 s2, s18 -; GFX6-NEXT: s_cselect_b32 s16, s2, s18 -; GFX6-NEXT: s_sub_i32 s2, s2, s16 -; GFX6-NEXT: s_cmp_lt_u32 s3, s19 -; GFX6-NEXT: s_cselect_b32 s16, s3, s19 -; GFX6-NEXT: s_sub_i32 s3, s3, s16 -; GFX6-NEXT: s_cmp_lt_u32 s4, s20 -; GFX6-NEXT: s_cselect_b32 s16, s4, s20 -; GFX6-NEXT: s_sub_i32 s4, s4, s16 -; GFX6-NEXT: s_cmp_lt_u32 s5, s21 -; GFX6-NEXT: s_cselect_b32 s16, s5, s21 -; GFX6-NEXT: s_sub_i32 s5, s5, s16 -; GFX6-NEXT: s_cmp_lt_u32 s6, s22 -; GFX6-NEXT: s_cselect_b32 s16, s6, s22 -; GFX6-NEXT: s_sub_i32 s6, s6, s16 -; GFX6-NEXT: s_cmp_lt_u32 s7, s23 -; GFX6-NEXT: s_cselect_b32 s16, s7, s23 -; GFX6-NEXT: s_sub_i32 s7, s7, s16 -; GFX6-NEXT: s_cmp_lt_u32 s8, s24 -; GFX6-NEXT: s_cselect_b32 s16, s8, s24 -; GFX6-NEXT: s_sub_i32 s8, s8, s16 -; GFX6-NEXT: s_cmp_lt_u32 s9, s25 -; GFX6-NEXT: s_cselect_b32 s16, s9, s25 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 -; GFX6-NEXT: s_cmp_lt_u32 s10, s26 -; GFX6-NEXT: s_cselect_b32 s16, s10, s26 -; GFX6-NEXT: s_sub_i32 s10, s10, s16 -; GFX6-NEXT: s_cmp_lt_u32 s11, s27 -; GFX6-NEXT: s_cselect_b32 s16, s11, s27 -; GFX6-NEXT: s_sub_i32 s11, s11, s16 -; GFX6-NEXT: s_cmp_lt_u32 s12, s28 -; GFX6-NEXT: s_cselect_b32 s16, s12, s28 -; GFX6-NEXT: s_sub_i32 s12, s12, s16 -; GFX6-NEXT: s_cmp_lt_u32 s13, s29 -; GFX6-NEXT: s_cselect_b32 s16, s13, s29 -; GFX6-NEXT: s_sub_i32 s13, s13, s16 -; GFX6-NEXT: s_cmp_lt_u32 s14, s30 -; GFX6-NEXT: s_cselect_b32 s16, s14, s30 -; GFX6-NEXT: s_sub_i32 s14, s14, s16 -; GFX6-NEXT: s_cmp_lt_u32 s15, s31 -; GFX6-NEXT: s_cselect_b32 s16, s15, s31 -; GFX6-NEXT: s_sub_i32 s15, s15, s16 +; GFX6-NEXT: s_cmp_gt_u32 s0, s16 +; GFX6-NEXT: s_cselect_b32 s32, s0, s16 +; GFX6-NEXT: s_sub_i32 s0, s32, s16 +; GFX6-NEXT: s_cmp_gt_u32 s1, s17 +; GFX6-NEXT: s_cselect_b32 s1, s1, s17 +; GFX6-NEXT: s_sub_i32 s1, s1, s17 +; GFX6-NEXT: s_cmp_gt_u32 s2, s18 +; GFX6-NEXT: s_cselect_b32 s2, s2, s18 +; 
GFX6-NEXT: s_sub_i32 s2, s2, s18 +; GFX6-NEXT: s_cmp_gt_u32 s3, s19 +; GFX6-NEXT: s_cselect_b32 s3, s3, s19 +; GFX6-NEXT: s_sub_i32 s3, s3, s19 +; GFX6-NEXT: s_cmp_gt_u32 s4, s20 +; GFX6-NEXT: s_cselect_b32 s4, s4, s20 +; GFX6-NEXT: s_sub_i32 s4, s4, s20 +; GFX6-NEXT: s_cmp_gt_u32 s5, s21 +; GFX6-NEXT: s_cselect_b32 s5, s5, s21 +; GFX6-NEXT: s_sub_i32 s5, s5, s21 +; GFX6-NEXT: s_cmp_gt_u32 s6, s22 +; GFX6-NEXT: s_cselect_b32 s6, s6, s22 +; GFX6-NEXT: s_sub_i32 s6, s6, s22 +; GFX6-NEXT: s_cmp_gt_u32 s7, s23 +; GFX6-NEXT: s_cselect_b32 s7, s7, s23 +; GFX6-NEXT: s_sub_i32 s7, s7, s23 +; GFX6-NEXT: s_cmp_gt_u32 s8, s24 +; GFX6-NEXT: s_cselect_b32 s8, s8, s24 +; GFX6-NEXT: s_sub_i32 s8, s8, s24 +; GFX6-NEXT: s_cmp_gt_u32 s9, s25 +; GFX6-NEXT: s_cselect_b32 s9, s9, s25 +; GFX6-NEXT: s_sub_i32 s9, s9, s25 +; GFX6-NEXT: s_cmp_gt_u32 s10, s26 +; GFX6-NEXT: s_cselect_b32 s10, s10, s26 +; GFX6-NEXT: s_sub_i32 s10, s10, s26 +; GFX6-NEXT: s_cmp_gt_u32 s11, s27 +; GFX6-NEXT: s_cselect_b32 s11, s11, s27 +; GFX6-NEXT: s_sub_i32 s11, s11, s27 +; GFX6-NEXT: s_cmp_gt_u32 s12, s28 +; GFX6-NEXT: s_cselect_b32 s12, s12, s28 +; GFX6-NEXT: s_sub_i32 s12, s12, s28 +; GFX6-NEXT: s_cmp_gt_u32 s13, s29 +; GFX6-NEXT: s_cselect_b32 s13, s13, s29 +; GFX6-NEXT: s_sub_i32 s13, s13, s29 +; GFX6-NEXT: s_cmp_gt_u32 s14, s30 +; GFX6-NEXT: s_cselect_b32 s14, s14, s30 +; GFX6-NEXT: s_sub_i32 s14, s14, s30 +; GFX6-NEXT: s_cmp_gt_u32 s15, s31 +; GFX6-NEXT: s_cselect_b32 s15, s15, s31 +; GFX6-NEXT: s_sub_i32 s15, s15, s31 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v16i32: @@ -1628,11 +1616,11 @@ ; GFX6-LABEL: v_usubsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_i16: @@ -1661,12 +1649,12 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_cmp_lt_u32 s0, s1 -; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_cmp_gt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s0, s0, s1 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i16: @@ -1696,11 +1684,11 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-LABEL: usubsat_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_max_u32_e32 v1, s0, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i16_sv: @@ -1726,11 +1714,11 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-LABEL: usubsat_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_mov_b32 s1, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i16_vs: @@ -1757,16 +1745,15 @@ ; 
GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: @@ -1799,21 +1786,19 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_cmp_lt_u32 s0, s2 -; GFX6-NEXT: s_cselect_b32 s2, s0, s2 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s4 +; GFX6-NEXT: s_cmp_gt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s0, s0, s2 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s3, 16 -; GFX6-NEXT: s_cmp_lt_u32 s1, s2 -; GFX6-NEXT: s_cselect_b32 s2, s1, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_cmp_gt_u32 s1, s2 +; GFX6-NEXT: s_cselect_b32 s1, s1, s2 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, 
s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1853,19 +1838,17 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: usubsat_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_max_u32_e32 v2, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_max_u32_e32 v2, s0, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1899,19 +1882,17 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-LABEL: usubsat_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_min_u32_e32 v2, s0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_u32_e32 v2, s0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_mov_b32 s2, 0xffff +; 
GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: v_max_u32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: v_max_u32_e32 v1, s0, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -1957,33 +1938,30 @@ ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX6-NEXT: v_min_u32_e32 v4, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_min_u32_e32 v4, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2025,37 +2003,33 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_cmp_lt_u32 s0, s4 -; GFX6-NEXT: s_cselect_b32 s4, s0, s4 +; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s4, s4, s8 +; GFX6-NEXT: s_cmp_gt_u32 s0, s4 +; GFX6-NEXT: s_cselect_b32 s0, s0, s4 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s4, s5, 16 -; GFX6-NEXT: s_cmp_lt_u32 s1, s4 -; GFX6-NEXT: s_cselect_b32 s4, s1, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_cmp_gt_u32 s1, s4 +; GFX6-NEXT: s_cselect_b32 s1, s1, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_cmp_lt_u32 s2, s4 -; GFX6-NEXT: s_cselect_b32 s4, s2, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s4, s6, s8 +; GFX6-NEXT: s_cmp_gt_u32 s2, s4 +; GFX6-NEXT: s_cselect_b32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_cmp_lt_u32 s3, s4 -; GFX6-NEXT: s_cselect_b32 s4, s3, s4 +; GFX6-NEXT: s_and_b32 s3, s3, s8 +; 
GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_cmp_gt_u32 s3, s4 +; GFX6-NEXT: s_cselect_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -2121,47 +2095,42 @@ ; GFX6-LABEL: v_usubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_min_u32_e32 v6, v0, v6 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v6 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX6-NEXT: v_min_u32_e32 v6, v1, v6 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v7 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_min_u32_e32 v6, v2, v6 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v8 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v6 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_min_u32_e32 v6, v3, v6 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_min_u32_e32 v6, v4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v10 +; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff +; GFX6-NEXT: v_max_u32_e32 v4, v4, v6 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v12 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v6, v11, v12 +; GFX6-NEXT: v_max_u32_e32 v5, v5, v6 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v12 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v12 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2210,53 +2179,47 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v6i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_cmp_lt_u32 s0, s6 -; GFX6-NEXT: s_cselect_b32 s6, s0, s6 +; 
GFX6-NEXT: s_mov_b32 s12, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s12 +; GFX6-NEXT: s_and_b32 s6, s6, s12 +; GFX6-NEXT: s_cmp_gt_u32 s0, s6 +; GFX6-NEXT: s_cselect_b32 s0, s0, s6 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s6, s7, 16 -; GFX6-NEXT: s_cmp_lt_u32 s1, s6 -; GFX6-NEXT: s_cselect_b32 s6, s1, s6 +; GFX6-NEXT: s_and_b32 s1, s1, s12 +; GFX6-NEXT: s_and_b32 s6, s7, s12 +; GFX6-NEXT: s_cmp_gt_u32 s1, s6 +; GFX6-NEXT: s_cselect_b32 s1, s1, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_cmp_lt_u32 s2, s6 -; GFX6-NEXT: s_cselect_b32 s6, s2, s6 +; GFX6-NEXT: s_and_b32 s2, s2, s12 +; GFX6-NEXT: s_and_b32 s6, s8, s12 +; GFX6-NEXT: s_cmp_gt_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s2, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_cmp_lt_u32 s3, s6 -; GFX6-NEXT: s_cselect_b32 s6, s3, s6 +; GFX6-NEXT: s_and_b32 s3, s3, s12 +; GFX6-NEXT: s_and_b32 s6, s9, s12 +; GFX6-NEXT: s_cmp_gt_u32 s3, s6 +; GFX6-NEXT: s_cselect_b32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_cmp_lt_u32 s4, s6 -; GFX6-NEXT: s_cselect_b32 s6, s4, s6 +; GFX6-NEXT: s_and_b32 s4, s4, s12 +; GFX6-NEXT: s_and_b32 s6, s10, s12 +; GFX6-NEXT: s_cmp_gt_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s4, s4, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_cmp_lt_u32 s5, s6 -; GFX6-NEXT: s_cselect_b32 s6, s5, s6 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 
s5, s5, s12 +; GFX6-NEXT: s_and_b32 s6, s11, s12 +; GFX6-NEXT: s_cmp_gt_u32 s5, s6 +; GFX6-NEXT: s_cselect_b32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s1, s1, s12 +; GFX6-NEXT: s_and_b32 s0, s0, s12 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s1, s2, s12 +; GFX6-NEXT: s_and_b32 s2, s3, s12 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, s12 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, s12 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog @@ -2325,61 +2288,54 @@ ; GFX6-LABEL: v_usubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_min_u32_e32 v8, v0, v8 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX6-NEXT: v_min_u32_e32 v8, v1, v8 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_min_u32_e32 v8, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_min_u32_e32 v8, v3, v8 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: 
v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v8 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_min_u32_e32 v8, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_max_u32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_min_u32_e32 v8, v5, v8 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v8, v13, v16 +; GFX6-NEXT: v_max_u32_e32 v5, v5, v8 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_min_u32_e32 v8, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v6, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v8, v14, v16 +; GFX6-NEXT: v_max_u32_e32 v6, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_max_u32_e32 v7, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: 
v_and_b32_e32 v3, v5, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2434,69 +2390,61 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_cmp_lt_u32 s0, s8 -; GFX6-NEXT: s_cselect_b32 s8, s0, s8 +; GFX6-NEXT: s_mov_b32 s16, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s8, s8, s16 +; GFX6-NEXT: s_cmp_gt_u32 s0, s8 +; GFX6-NEXT: s_cselect_b32 s0, s0, s8 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s8, s9, 16 -; GFX6-NEXT: s_cmp_lt_u32 s1, s8 -; GFX6-NEXT: s_cselect_b32 s8, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s8, s9, s16 +; GFX6-NEXT: s_cmp_gt_u32 s1, s8 +; GFX6-NEXT: s_cselect_b32 s1, s1, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_cmp_lt_u32 s2, s8 -; GFX6-NEXT: s_cselect_b32 s8, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s8, s10, s16 +; GFX6-NEXT: s_cmp_gt_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; 
GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_cmp_lt_u32 s3, s8 -; GFX6-NEXT: s_cselect_b32 s8, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s8, s11, s16 +; GFX6-NEXT: s_cmp_gt_u32 s3, s8 +; GFX6-NEXT: s_cselect_b32 s3, s3, s8 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_cmp_lt_u32 s4, s8 -; GFX6-NEXT: s_cselect_b32 s8, s4, s8 +; GFX6-NEXT: s_and_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s8, s12, s16 +; GFX6-NEXT: s_cmp_gt_u32 s4, s8 +; GFX6-NEXT: s_cselect_b32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_cmp_lt_u32 s5, s8 -; GFX6-NEXT: s_cselect_b32 s8, s5, s8 +; GFX6-NEXT: s_and_b32 s5, s5, s16 +; GFX6-NEXT: s_and_b32 s8, s13, s16 +; GFX6-NEXT: s_cmp_gt_u32 s5, s8 +; GFX6-NEXT: s_cselect_b32 s5, s5, s8 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_cmp_lt_u32 s6, s8 -; GFX6-NEXT: s_cselect_b32 s8, s6, s8 +; GFX6-NEXT: s_and_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s8, s14, s16 +; GFX6-NEXT: s_cmp_gt_u32 s6, s8 +; GFX6-NEXT: s_cselect_b32 s6, s6, s8 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s7, s7, 16 -; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_cmp_lt_u32 s7, s8 -; GFX6-NEXT: s_cselect_b32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s8, s15, s16 +; GFX6-NEXT: s_cmp_gt_u32 s7, s8 +; GFX6-NEXT: s_cselect_b32 s7, s7, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s0, s0, s16 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: 
s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s1, s2, s16 +; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s3, s5, s16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_lshr_b32 s7, s7, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, s16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog