Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -431,7 +431,7 @@ .widenScalarToNextPow2(0, 32); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .lowerFor({S32, S16, V2S16}) // FIXME: legal and merge with add/sub/mul + .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalar(0, S16) .clampMaxNumElements(0, S16, 2) .scalarize(0) @@ -442,7 +442,7 @@ // not signed s16. We should handle scalar s16 as the low half of the // vector. getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) - .lowerFor({V2S16}) // FIXME: Make legal + .lowerFor({V2S16}) // Clamp modifier .minScalar(0, S32) .clampMaxNumElements(0, S16, 2) .scalarize(0) @@ -460,7 +460,7 @@ .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .lowerFor({S32, S16}) // FIXME: legal with clamp modifier + .legalFor({S32, S16}) // Clamp modifier .minScalar(0, S16) .scalarize(0) .widenScalarToNextPow2(0, 16) @@ -480,7 +480,7 @@ if (ST.hasIntClamp()) { getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) - .lowerFor({S32}) // FIXME: legal with clamp modifier. + .legalFor({S32}) // Clamp modifier. .scalarize(0) .minScalarOrElt(0, S32) .lower(); Index: llvm/lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -827,6 +827,40 @@ } // End Predicates = [Has16BitInsts] +class getVSrcOp { + RegisterOperand ret = !if(!eq(vt.Size, 32), VSrc_b32, VSrc_b16); +} + +// Class for binary integer operations with the clamp bit set for saturation +// TODO: Add sub with negated inline constant pattern. +class BinOpClampPat : + GCNPat<(node vt:$src0, vt:$src1), + (inst getVSrcOp.ret:$src0, getVSrcOp.ret:$src1, + DSTCLAMP.ENABLE) +>; + +let SubtargetPredicate = HasIntClamp in { +// Set clamp bit for saturation. Note the oddity of the _I32 +// instruction for the unsigned operation. The instructions were +// mis-named originally, and renamed in VI, but do have the unsigned +// behavior. +// FIXME: Should rename the pseudoinstruction +def : BinOpClampPat; +def : BinOpClampPat; +} + +let SubtargetPredicate = HasAddNoCarryInsts, OtherPredicates = [HasIntClamp] in { +let AddedComplexity = 1 in { // Prefer over form with carry-out. +def : BinOpClampPat; +def : BinOpClampPat; +} +} + +let SubtargetPredicate = Has16BitInsts, OtherPredicates = [HasIntClamp] in { +def : BinOpClampPat; +def : BinOpClampPat; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -77,6 +77,8 @@ def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; +let SubtargetPredicate = HasVOP3PInsts in { + // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. // The constant will be emitted as a mov, and folded later. @@ -86,6 +88,19 @@ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) >; +// Integer operations with clamp bit set. +class VOP3PSatPat : GCNPat< + (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), + (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))), + (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE) +>; + +def : VOP3PSatPat; +def : VOP3PSatPat; +def : VOP3PSatPat; +def : VOP3PSatPat; +} // End SubtargetPredicate = HasVOP3PInsts + multiclass MadFmaMixPats) = G_BITCAST [[OR]](s32) @@ -380,12 +347,8 @@ ; GFX9-LABEL: name: uaddsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[COPY1]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[ADD]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[UADDSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UADDSAT %0, %1 @@ -482,24 +445,17 @@ ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC3]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC4]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]] - ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC5]] - ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC3]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC4]] + ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC5]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16) - ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C]](s32) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX8: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>) @@ -536,16 +492,9 @@ ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[UMIN]] - ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC5]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -650,26 +599,17 @@ ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[XOR]], [[TRUNC4]] - ; GFX8: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C1]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[XOR1]], [[TRUNC5]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s16) = G_ADD [[TRUNC1]], [[UMIN1]] - ; GFX8: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[XOR2]], [[TRUNC6]] - ; GFX8: [[ADD2:%[0-9]+]]:_(s16) = G_ADD [[TRUNC2]], [[UMIN2]] - ; GFX8: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC3]], [[C1]] - ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[XOR3]], [[TRUNC7]] - ; GFX8: [[ADD3:%[0-9]+]]:_(s16) = G_ADD [[TRUNC3]], [[UMIN3]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ADD]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ADD1]](s16) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC4]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC1]], [[TRUNC5]] + ; GFX8: [[UADDSAT2:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC2]], [[TRUNC6]] + ; GFX8: [[UADDSAT3:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC3]], [[TRUNC7]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ADD2]](s16) - ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ADD3]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT2]](s16) + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[UADDSAT3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) @@ -680,16 +620,9 @@ ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV]], [[BUILD_VECTOR_TRUNC]] - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR]], [[UV2]] - ; GFX9: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV]], [[UMIN]] - ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV1]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[XOR1]], [[UV3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[UV1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ADD]](<2 x s16>), [[ADD1]](<2 x s16>) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[UV]], [[UV2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UADDSAT]](<2 x s16>), [[UADDSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -714,19 +647,13 @@ ; GFX8-LABEL: name: uaddsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]] - ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]] - ; GFX8: $vgpr0 = COPY [[ADD]](s32) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX8: $vgpr0 = COPY [[UADDSAT]](s32) ; GFX9-LABEL: name: uaddsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[ADD]](s32) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[UADDSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_UADDSAT %0, %1 @@ -758,28 +685,18 @@ ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX8: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]] - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]] - ; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]] - ; GFX8: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]] - ; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]] - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX8: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]] + ; GFX8: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32) ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: uaddsat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV]], [[C]] - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[UV2]] - ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UV]], [[UMIN]] - ; GFX9: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV1]], [[C]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[XOR1]], [[UV3]] - ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UV1]], [[UMIN1]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ADD]](s32), [[ADD1]](s32) + ; GFX9: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[UV]], [[UV2]] + ; GFX9: [[UADDSAT1:%[0-9]+]]:_(s32) = G_UADDSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UADDSAT]](s32), [[UADDSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -30,9 +30,8 @@ ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s7 @@ -43,9 +42,8 @@ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -84,9 +82,8 @@ ; GFX8: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s8 @@ -97,9 +94,8 @@ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C]](s16) ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 @@ -172,16 +168,14 @@ ; GFX8: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX8: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX8: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C3]](s16) ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX8: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX8: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX8: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX8: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX8: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX8: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -208,16 +202,14 @@ ; GFX9: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C3]](s16) ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[C3]](s16) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[SHL]], [[SHL1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[SHL]], [[UMIN]] - ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[SUB]], [[C3]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL]], [[SHL1]] + ; GFX9: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT]], [[C3]](s16) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[C3]](s16) ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[C3]](s16) - ; GFX9: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[SHL2]], [[SHL3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[SHL2]], [[UMIN1]] - ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SUB1]], [[C3]](s16) + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[SHL2]], [[SHL3]] + ; GFX9: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[USUBSAT1]], [[C3]](s16) ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9: [[COPY2:%[0-9]+]]:_(s16) = COPY [[LSHR6]](s16) ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[COPY2]], [[C4]] @@ -263,18 +255,16 @@ ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX8: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX8: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: usubsat_s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SUB]](s16) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -334,12 +324,10 @@ ; GFX8: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC3]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) @@ -347,9 +335,8 @@ ; GFX9-LABEL: name: usubsat_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_USUBSAT %0, %1 @@ -442,18 +429,15 @@ ; GFX8: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX8: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC3]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC4]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC5]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC3]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC4]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC5]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] @@ -492,11 +476,9 @@ ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[DEF1]](s32) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), [[SUB1]](<2 x s16>), [[DEF2]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>), [[DEF2]](<2 x s16>) ; GFX9: [[EXTRACT:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[CONCAT_VECTORS]](<6 x s16>), 0 ; GFX9: [[EXTRACT1:%[0-9]+]]:_(<3 x s16>) = G_EXTRACT [[DEF]](<4 x s16>), 0 ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[EXTRACT]](<3 x s16>), [[EXTRACT1]](<3 x s16>) @@ -596,21 +578,17 @@ ; GFX8: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX8: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) ; GFX8: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX8: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC4]] - ; GFX8: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[TRUNC]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC1]], [[TRUNC5]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[TRUNC1]], [[UMIN1]] - ; GFX8: [[UMIN2:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC2]], [[TRUNC6]] - ; GFX8: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[TRUNC2]], [[UMIN2]] - ; GFX8: [[UMIN3:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC3]], [[TRUNC7]] - ; GFX8: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[TRUNC3]], [[UMIN3]] - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SUB]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[SUB1]](s16) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC4]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC1]], [[TRUNC5]] + ; GFX8: [[USUBSAT2:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC2]], [[TRUNC6]] + ; GFX8: [[USUBSAT3:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC3]], [[TRUNC7]] + ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT]](s16) + ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT1]](s16) ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] ; GFX8: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[SUB2]](s16) - ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[SUB3]](s16) + ; GFX8: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT2]](s16) + ; GFX8: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[USUBSAT3]](s16) ; GFX8: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) ; GFX8: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]] ; GFX8: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) @@ -621,11 +599,9 @@ ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SUB]](<2 x s16>), [[SUB1]](<2 x s16>) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[USUBSAT]](<2 x s16>), [[USUBSAT1]](<2 x s16>) ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -648,15 +624,13 @@ ; GFX8-LABEL: name: usubsat_s32 ; GFX8: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX8: $vgpr0 = COPY [[SUB]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX8: $vgpr0 = COPY [[USUBSAT]](s32) ; GFX9-LABEL: name: usubsat_s32 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]] - ; GFX9: $vgpr0 = COPY [[SUB]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; GFX9: $vgpr0 = COPY [[USUBSAT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_USUBSAT %0, %1 @@ -685,22 +659,18 @@ ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX8: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX8: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX8: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX8: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX8: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX8: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX8: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX8: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: usubsat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]] - ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV]], [[UMIN]] - ; GFX9: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]] - ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV1]], [[UMIN1]] - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SUB]](s32), [[SUB1]](s32) + ; GFX9: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[UV]], [[UV2]] + ; GFX9: [[USUBSAT1:%[0-9]+]]:_(s32) = G_USUBSAT [[UV1]], [[UV3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[USUBSAT]](s32), [[USUBSAT1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -20,9 +20,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -31,9 +29,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.uadd.sat.i7(i7 %value, i7 %amount) @@ -55,31 +51,23 @@ ; GFX8-LABEL: s_uaddsat_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i7 @llvm.uadd.sat.i7(i7 %value, i7 %amount) ret i7 %result @@ -102,9 +90,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -113,9 +99,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %value, i8 %amount) @@ -137,31 +121,23 @@ ; GFX8-LABEL: s_uaddsat_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_xor_b32 s3, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i8 @llvm.uadd.sat.i8(i8 %value, i8 %amount) ret i8 %result @@ -184,9 +160,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -195,9 +169,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.uadd.sat.i24(i24 %value, i24 %amount) @@ -205,16 +177,36 @@ } define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %value, i24 inreg %amount) { -; GCN-LABEL: s_uaddsat_i24: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, 8 -; GCN-NEXT: s_lshl_b32 s1, s1, 8 -; GCN-NEXT: s_not_b32 s2, s0 -; GCN-NEXT: s_cmp_lt_u32 s2, s1 -; GCN-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s1 -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, 8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_not_b32 s2, s0 +; GFX6-NEXT: s_cmp_lt_u32 s2, s1 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog %result = call i24 @llvm.uadd.sat.i24(i24 %value, i24 %amount) ret i24 %result } @@ -231,30 +223,40 @@ ; GFX8-LABEL: v_uaddsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.uadd.sat.i32(i32 %value, i32 %amount) ret i32 %result } define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %value, i32 inreg %amount) { -; GCN-LABEL: s_uaddsat_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s2, s0 -; GCN-NEXT: s_cmp_lt_u32 s2, s1 -; GCN-NEXT: s_cselect_b32 s1, s2, s1 -; GCN-NEXT: s_add_i32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s2, s0 +; GFX6-NEXT: s_cmp_lt_u32 s2, s1 +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %value, i32 %amount) ret i32 %result } @@ -269,16 +271,12 @@ ; ; GFX8-LABEL: uaddsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_not_b32 s1, s0 -; GFX8-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_not_b32 s1, s0 -; GFX9-NEXT: v_min_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %value, i32 %amount) %cast = bitcast i32 %result to float @@ -295,16 +293,12 @@ ; ; GFX8-LABEL: uaddsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v1, s0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.uadd.sat.i32(i32 %value, i32 %amount) %cast = bitcast i32 %result to float @@ -326,40 +320,52 @@ ; GFX8-LABEL: v_uaddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %value, <2 x i32> %amount) ret <2 x i32> %result } define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %value, <2 x i32> inreg %amount) { -; GCN-LABEL: s_uaddsat_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s4, s0 -; GCN-NEXT: s_cmp_lt_u32 s4, s2 -; GCN-NEXT: s_cselect_b32 s2, s4, s2 -; GCN-NEXT: s_add_i32 s0, s0, s2 -; GCN-NEXT: s_not_b32 s2, s1 -; GCN-NEXT: s_cmp_lt_u32 s2, s3 -; GCN-NEXT: s_cselect_b32 s2, s2, s3 -; GCN-NEXT: s_add_i32 s1, s1, s2 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s4, s0 +; GFX6-NEXT: s_cmp_lt_u32 s4, s2 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 +; GFX6-NEXT: s_add_i32 s0, s0, s2 +; GFX6-NEXT: s_not_b32 s2, s1 +; GFX6-NEXT: s_cmp_lt_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s2, s3 +; GFX6-NEXT: s_add_i32 s1, s1, s2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %value, <2 x i32> %amount) ret <2 x i32> %result } @@ -382,50 +388,64 @@ ; GFX8-LABEL: v_uaddsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v3, v6, v3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v5 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %value, <3 x i32> %amount) ret <3 x i32> %result } define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %value, <3 x i32> inreg %amount) { -; GCN-LABEL: s_uaddsat_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s6, s0 -; GCN-NEXT: s_cmp_lt_u32 s6, s3 -; GCN-NEXT: s_cselect_b32 s3, s6, s3 -; GCN-NEXT: s_add_i32 s0, s0, s3 -; GCN-NEXT: s_not_b32 s3, s1 -; GCN-NEXT: s_cmp_lt_u32 s3, s4 -; GCN-NEXT: s_cselect_b32 s3, s3, s4 -; GCN-NEXT: s_add_i32 s1, s1, s3 -; GCN-NEXT: s_not_b32 s3, s2 -; GCN-NEXT: s_cmp_lt_u32 s3, s5 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s6, s0 +; GFX6-NEXT: s_cmp_lt_u32 s6, s3 +; GFX6-NEXT: s_cselect_b32 s3, s6, s3 +; GFX6-NEXT: s_add_i32 s0, s0, s3 +; GFX6-NEXT: s_not_b32 s3, s1 +; GFX6-NEXT: s_cmp_lt_u32 s3, s4 +; GFX6-NEXT: s_cselect_b32 s3, s3, s4 +; GFX6-NEXT: s_add_i32 s1, s1, s3 +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_cmp_lt_u32 s3, s5 +; GFX6-NEXT: s_cselect_b32 s3, s3, s5 +; GFX6-NEXT: s_add_i32 s2, s2, s3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %value, <3 x i32> %amount) ret <3 x i32> %result } @@ -451,60 +471,76 @@ ; GFX8-LABEL: v_uaddsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v4, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v4, v8, v4 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %value, <4 x i32> %amount) ret <4 x i32> %result } define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %value, <4 x i32> inreg %amount) { -; GCN-LABEL: s_uaddsat_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s8, s0 -; GCN-NEXT: s_cmp_lt_u32 s8, s4 -; GCN-NEXT: s_cselect_b32 s4, s8, s4 -; GCN-NEXT: s_add_i32 s0, s0, s4 -; GCN-NEXT: s_not_b32 s4, s1 -; GCN-NEXT: s_cmp_lt_u32 s4, s5 -; GCN-NEXT: s_cselect_b32 s4, s4, s5 -; GCN-NEXT: s_add_i32 s1, s1, s4 -; GCN-NEXT: s_not_b32 s4, s2 -; GCN-NEXT: s_cmp_lt_u32 s4, s6 -; GCN-NEXT: s_cselect_b32 s4, s4, s6 -; GCN-NEXT: s_add_i32 s2, s2, s4 -; GCN-NEXT: s_not_b32 s4, s3 -; GCN-NEXT: s_cmp_lt_u32 s4, s7 -; GCN-NEXT: s_cselect_b32 s4, s4, s7 -; GCN-NEXT: s_add_i32 s3, s3, s4 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s8, s0 +; GFX6-NEXT: s_cmp_lt_u32 s8, s4 +; GFX6-NEXT: s_cselect_b32 s4, s8, s4 +; GFX6-NEXT: s_add_i32 s0, s0, s4 +; GFX6-NEXT: s_not_b32 s4, s1 +; GFX6-NEXT: s_cmp_lt_u32 s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s4, s5 +; GFX6-NEXT: s_add_i32 s1, s1, s4 +; GFX6-NEXT: s_not_b32 s4, s2 +; GFX6-NEXT: s_cmp_lt_u32 s4, s6 +; GFX6-NEXT: s_cselect_b32 s4, s4, s6 +; GFX6-NEXT: s_add_i32 s2, s2, s4 +; GFX6-NEXT: s_not_b32 s4, s3 +; GFX6-NEXT: s_cmp_lt_u32 s4, s7 +; GFX6-NEXT: s_cselect_b32 s4, s4, s7 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e64 v0, s[8:9], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %value, <4 x i32> %amount) ret <4 x i32> %result } @@ -533,70 +569,88 @@ ; GFX8-LABEL: v_uaddsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v10, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v5, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v8 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v4 -; GFX8-NEXT: v_min_u32_e32 v5, v5, v9 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v7 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v8 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v9 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v10, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v5, v10, v5 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v7 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v8 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v4 -; GFX9-NEXT: v_min_u32_e32 v5, v5, v9 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v7 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v8 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %value, <5 x i32> %amount) ret <5 x i32> %result } define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %value, <5 x i32> inreg %amount) { -; GCN-LABEL: s_uaddsat_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s10, s0 -; GCN-NEXT: s_cmp_lt_u32 s10, s5 -; GCN-NEXT: s_cselect_b32 s5, s10, s5 -; GCN-NEXT: s_add_i32 s0, s0, s5 -; GCN-NEXT: s_not_b32 s5, s1 -; GCN-NEXT: s_cmp_lt_u32 s5, s6 -; GCN-NEXT: s_cselect_b32 s5, s5, s6 -; GCN-NEXT: s_add_i32 s1, s1, s5 -; GCN-NEXT: s_not_b32 s5, s2 -; GCN-NEXT: s_cmp_lt_u32 s5, s7 -; GCN-NEXT: s_cselect_b32 s5, s5, s7 -; GCN-NEXT: s_add_i32 s2, s2, s5 -; GCN-NEXT: s_not_b32 s5, s3 -; GCN-NEXT: s_cmp_lt_u32 s5, s8 -; GCN-NEXT: s_cselect_b32 s5, s5, s8 -; GCN-NEXT: s_add_i32 s3, s3, s5 -; GCN-NEXT: s_not_b32 s5, s4 -; GCN-NEXT: s_cmp_lt_u32 s5, s9 -; GCN-NEXT: s_cselect_b32 s5, s5, s9 -; GCN-NEXT: s_add_i32 s4, s4, s5 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_v5i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s10, s0 +; GFX6-NEXT: s_cmp_lt_u32 s10, s5 +; GFX6-NEXT: s_cselect_b32 s5, s10, s5 +; GFX6-NEXT: s_add_i32 s0, s0, s5 +; GFX6-NEXT: s_not_b32 s5, s1 +; GFX6-NEXT: s_cmp_lt_u32 s5, s6 +; GFX6-NEXT: s_cselect_b32 s5, s5, s6 +; GFX6-NEXT: s_add_i32 s1, s1, s5 +; GFX6-NEXT: s_not_b32 s5, s2 +; GFX6-NEXT: s_cmp_lt_u32 s5, s7 +; GFX6-NEXT: s_cselect_b32 s5, s5, s7 +; GFX6-NEXT: s_add_i32 s2, s2, s5 +; GFX6-NEXT: s_not_b32 s5, s3 +; GFX6-NEXT: s_cmp_lt_u32 s5, s8 +; GFX6-NEXT: s_cselect_b32 s5, s5, s8 +; GFX6-NEXT: s_add_i32 s3, s3, s5 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_cmp_lt_u32 s5, s9 +; GFX6-NEXT: s_cselect_b32 s5, s5, s9 +; GFX6-NEXT: s_add_i32 s4, s4, s5 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_v5i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_add_u32_e64 v0, s[10:11], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], s4, v4 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_v5i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %value, <5 x i32> %amount) ret <5 x i32> %result } @@ -658,180 +712,220 @@ ; GFX8-LABEL: v_uaddsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v32, -1, v0 -; GFX8-NEXT: v_min_u32_e32 v16, v32, v16 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v1 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v17 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v2 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v18 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v3 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v19 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v4 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v20 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v5 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v21 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v6 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v22 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v7 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v23 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v8 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v24 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v9 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v25 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v10 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v26 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v12 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v28 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v13 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v29 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v14 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v30 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v16 -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v15 -; GFX8-NEXT: v_min_u32_e32 v16, v16, v31 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 +; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v32, -1, v0 -; GFX9-NEXT: v_min_u32_e32 v16, v32, v16 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v1 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v17 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v2 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v18 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v19 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v4 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v20 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v5 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v21 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v6 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v22 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v7 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v23 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v8 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v24 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v9 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v25 -; GFX9-NEXT: v_add_u32_e32 v9, v9, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v10 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v26 -; GFX9-NEXT: v_add_u32_e32 v10, v10, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v11 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v27 -; GFX9-NEXT: v_add_u32_e32 v11, v11, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v12 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v28 -; GFX9-NEXT: v_add_u32_e32 v12, v12, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v13 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v29 -; GFX9-NEXT: v_add_u32_e32 v13, v13, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v14 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v30 -; GFX9-NEXT: v_add_u32_e32 v14, v14, v16 -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v15 -; GFX9-NEXT: v_min_u32_e32 v16, v16, v31 -; GFX9-NEXT: v_add_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %value, <16 x i32> %amount) ret <16 x i32> %result } define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %value, <16 x i32> inreg %amount) { -; GCN-LABEL: s_uaddsat_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_not_b32 s32, s0 -; GCN-NEXT: s_cmp_lt_u32 s32, s16 -; GCN-NEXT: s_cselect_b32 s16, s32, s16 -; GCN-NEXT: s_add_i32 s0, s0, s16 -; GCN-NEXT: s_not_b32 s16, s1 -; GCN-NEXT: s_cmp_lt_u32 s16, s17 -; GCN-NEXT: s_cselect_b32 s16, s16, s17 -; GCN-NEXT: s_add_i32 s1, s1, s16 -; GCN-NEXT: s_not_b32 s16, s2 -; GCN-NEXT: s_cmp_lt_u32 s16, s18 -; GCN-NEXT: s_cselect_b32 s16, s16, s18 -; GCN-NEXT: s_add_i32 s2, s2, s16 -; GCN-NEXT: s_not_b32 s16, s3 -; GCN-NEXT: s_cmp_lt_u32 s16, s19 -; GCN-NEXT: s_cselect_b32 s16, s16, s19 -; GCN-NEXT: s_add_i32 s3, s3, s16 -; GCN-NEXT: s_not_b32 s16, s4 -; GCN-NEXT: s_cmp_lt_u32 s16, s20 -; GCN-NEXT: s_cselect_b32 s16, s16, s20 -; GCN-NEXT: s_add_i32 s4, s4, s16 -; GCN-NEXT: s_not_b32 s16, s5 -; GCN-NEXT: s_cmp_lt_u32 s16, s21 -; GCN-NEXT: s_cselect_b32 s16, s16, s21 -; GCN-NEXT: s_add_i32 s5, s5, s16 -; GCN-NEXT: s_not_b32 s16, s6 -; GCN-NEXT: s_cmp_lt_u32 s16, s22 -; GCN-NEXT: s_cselect_b32 s16, s16, s22 -; GCN-NEXT: s_add_i32 s6, s6, s16 -; GCN-NEXT: s_not_b32 s16, s7 -; GCN-NEXT: s_cmp_lt_u32 s16, s23 -; GCN-NEXT: s_cselect_b32 s16, s16, s23 -; GCN-NEXT: s_add_i32 s7, s7, s16 -; GCN-NEXT: s_not_b32 s16, s8 -; GCN-NEXT: s_cmp_lt_u32 s16, s24 -; GCN-NEXT: s_cselect_b32 s16, s16, s24 -; GCN-NEXT: s_add_i32 s8, s8, s16 -; GCN-NEXT: s_not_b32 s16, s9 -; GCN-NEXT: s_cmp_lt_u32 s16, s25 -; GCN-NEXT: s_cselect_b32 s16, s16, s25 -; GCN-NEXT: s_add_i32 s9, s9, s16 -; GCN-NEXT: s_not_b32 s16, s10 -; GCN-NEXT: s_cmp_lt_u32 s16, s26 -; GCN-NEXT: s_cselect_b32 s16, s16, s26 -; GCN-NEXT: s_add_i32 s10, s10, s16 -; GCN-NEXT: s_not_b32 s16, s11 -; GCN-NEXT: s_cmp_lt_u32 s16, s27 -; GCN-NEXT: s_cselect_b32 s16, s16, s27 -; GCN-NEXT: s_add_i32 s11, s11, s16 -; GCN-NEXT: s_not_b32 s16, s12 -; GCN-NEXT: s_cmp_lt_u32 s16, s28 -; GCN-NEXT: s_cselect_b32 s16, s16, s28 -; GCN-NEXT: s_add_i32 s12, s12, s16 -; GCN-NEXT: s_not_b32 s16, s13 -; GCN-NEXT: s_cmp_lt_u32 s16, s29 -; GCN-NEXT: s_cselect_b32 s16, s16, s29 -; GCN-NEXT: s_add_i32 s13, s13, s16 -; GCN-NEXT: s_not_b32 s16, s14 -; GCN-NEXT: s_cmp_lt_u32 s16, s30 -; GCN-NEXT: s_cselect_b32 s16, s16, s30 -; GCN-NEXT: s_add_i32 s14, s14, s16 -; GCN-NEXT: s_not_b32 s16, s15 -; GCN-NEXT: s_cmp_lt_u32 s16, s31 -; GCN-NEXT: s_cselect_b32 s16, s16, s31 -; GCN-NEXT: s_add_i32 s15, s15, s16 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_uaddsat_v16i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s32, s0 +; GFX6-NEXT: s_cmp_lt_u32 s32, s16 +; GFX6-NEXT: s_cselect_b32 s16, s32, s16 +; GFX6-NEXT: s_add_i32 s0, s0, s16 +; GFX6-NEXT: s_not_b32 s16, s1 +; GFX6-NEXT: s_cmp_lt_u32 s16, s17 +; GFX6-NEXT: s_cselect_b32 s16, s16, s17 +; GFX6-NEXT: s_add_i32 s1, s1, s16 +; GFX6-NEXT: s_not_b32 s16, s2 +; GFX6-NEXT: s_cmp_lt_u32 s16, s18 +; GFX6-NEXT: s_cselect_b32 s16, s16, s18 +; GFX6-NEXT: s_add_i32 s2, s2, s16 +; GFX6-NEXT: s_not_b32 s16, s3 +; GFX6-NEXT: s_cmp_lt_u32 s16, s19 +; GFX6-NEXT: s_cselect_b32 s16, s16, s19 +; GFX6-NEXT: s_add_i32 s3, s3, s16 +; GFX6-NEXT: s_not_b32 s16, s4 +; GFX6-NEXT: s_cmp_lt_u32 s16, s20 +; GFX6-NEXT: s_cselect_b32 s16, s16, s20 +; GFX6-NEXT: s_add_i32 s4, s4, s16 +; GFX6-NEXT: s_not_b32 s16, s5 +; GFX6-NEXT: s_cmp_lt_u32 s16, s21 +; GFX6-NEXT: s_cselect_b32 s16, s16, s21 +; GFX6-NEXT: s_add_i32 s5, s5, s16 +; GFX6-NEXT: s_not_b32 s16, s6 +; GFX6-NEXT: s_cmp_lt_u32 s16, s22 +; GFX6-NEXT: s_cselect_b32 s16, s16, s22 +; GFX6-NEXT: s_add_i32 s6, s6, s16 +; GFX6-NEXT: s_not_b32 s16, s7 +; GFX6-NEXT: s_cmp_lt_u32 s16, s23 +; GFX6-NEXT: s_cselect_b32 s16, s16, s23 +; GFX6-NEXT: s_add_i32 s7, s7, s16 +; GFX6-NEXT: s_not_b32 s16, s8 +; GFX6-NEXT: s_cmp_lt_u32 s16, s24 +; GFX6-NEXT: s_cselect_b32 s16, s16, s24 +; GFX6-NEXT: s_add_i32 s8, s8, s16 +; GFX6-NEXT: s_not_b32 s16, s9 +; GFX6-NEXT: s_cmp_lt_u32 s16, s25 +; GFX6-NEXT: s_cselect_b32 s16, s16, s25 +; GFX6-NEXT: s_add_i32 s9, s9, s16 +; GFX6-NEXT: s_not_b32 s16, s10 +; GFX6-NEXT: s_cmp_lt_u32 s16, s26 +; GFX6-NEXT: s_cselect_b32 s16, s16, s26 +; GFX6-NEXT: s_add_i32 s10, s10, s16 +; GFX6-NEXT: s_not_b32 s16, s11 +; GFX6-NEXT: s_cmp_lt_u32 s16, s27 +; GFX6-NEXT: s_cselect_b32 s16, s16, s27 +; GFX6-NEXT: s_add_i32 s11, s11, s16 +; GFX6-NEXT: s_not_b32 s16, s12 +; GFX6-NEXT: s_cmp_lt_u32 s16, s28 +; GFX6-NEXT: s_cselect_b32 s16, s16, s28 +; GFX6-NEXT: s_add_i32 s12, s12, s16 +; GFX6-NEXT: s_not_b32 s16, s13 +; GFX6-NEXT: s_cmp_lt_u32 s16, s29 +; GFX6-NEXT: s_cselect_b32 s16, s16, s29 +; GFX6-NEXT: s_add_i32 s13, s13, s16 +; GFX6-NEXT: s_not_b32 s16, s14 +; GFX6-NEXT: s_cmp_lt_u32 s16, s30 +; GFX6-NEXT: s_cselect_b32 s16, s16, s30 +; GFX6-NEXT: s_add_i32 s14, s14, s16 +; GFX6-NEXT: s_not_b32 s16, s15 +; GFX6-NEXT: s_cmp_lt_u32 s16, s31 +; GFX6-NEXT: s_cselect_b32 s16, s16, s31 +; GFX6-NEXT: s_add_i32 s15, s15, s16 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_uaddsat_v16i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_add_u32_e64 v0, s[32:33], s0, v0 clamp +; GFX8-NEXT: v_add_u32_e64 v1, s[16:17], s1, v1 clamp +; GFX8-NEXT: v_add_u32_e64 v2, s[16:17], s2, v2 clamp +; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], s3, v3 clamp +; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], s4, v4 clamp +; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], s5, v5 clamp +; GFX8-NEXT: v_add_u32_e64 v6, s[2:3], s6, v6 clamp +; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], s7, v7 clamp +; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], s8, v8 clamp +; GFX8-NEXT: v_add_u32_e64 v9, s[2:3], s9, v9 clamp +; GFX8-NEXT: v_add_u32_e64 v10, s[2:3], s10, v10 clamp +; GFX8-NEXT: v_add_u32_e64 v11, s[2:3], s11, v11 clamp +; GFX8-NEXT: v_add_u32_e64 v12, s[2:3], s12, v12 clamp +; GFX8-NEXT: v_add_u32_e64 v13, s[2:3], s13, v13 clamp +; GFX8-NEXT: v_add_u32_e64 v14, s[2:3], s14, v14 clamp +; GFX8-NEXT: v_add_u32_e64 v15, s[2:3], s15, v15 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8-NEXT: v_readfirstlane_b32 s6, v6 +; GFX8-NEXT: v_readfirstlane_b32 s7, v7 +; GFX8-NEXT: v_readfirstlane_b32 s8, v8 +; GFX8-NEXT: v_readfirstlane_b32 s9, v9 +; GFX8-NEXT: v_readfirstlane_b32 s10, v10 +; GFX8-NEXT: v_readfirstlane_b32 s11, v11 +; GFX8-NEXT: v_readfirstlane_b32 s12, v12 +; GFX8-NEXT: v_readfirstlane_b32 s13, v13 +; GFX8-NEXT: v_readfirstlane_b32 s14, v14 +; GFX8-NEXT: v_readfirstlane_b32 s15, v15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_uaddsat_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_add_u32_e64 v5, s5, v5 clamp +; GFX9-NEXT: v_add_u32_e64 v6, s6, v6 clamp +; GFX9-NEXT: v_add_u32_e64 v7, s7, v7 clamp +; GFX9-NEXT: v_add_u32_e64 v8, s8, v8 clamp +; GFX9-NEXT: v_add_u32_e64 v9, s9, v9 clamp +; GFX9-NEXT: v_add_u32_e64 v10, s10, v10 clamp +; GFX9-NEXT: v_add_u32_e64 v11, s11, v11 clamp +; GFX9-NEXT: v_add_u32_e64 v12, s12, v12 clamp +; GFX9-NEXT: v_add_u32_e64 v13, s13, v13 clamp +; GFX9-NEXT: v_add_u32_e64 v14, s14, v14 clamp +; GFX9-NEXT: v_add_u32_e64 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %value, <16 x i32> %amount) ret <16 x i32> %result } @@ -851,17 +945,13 @@ ; GFX8-LABEL: v_uaddsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.uadd.sat.i16(i16 %value, i16 %amount) ret i16 %result @@ -881,22 +971,16 @@ ; ; GFX8-LABEL: s_uaddsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s2, s0, 0xffff -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s2, s0, 0xffff -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %value, i16 %amount) ret i16 %result @@ -915,16 +999,12 @@ ; ; GFX8-LABEL: uaddsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b32 s1, s0, 0xffff -; GFX8-NEXT: v_min_u16_e32 v0, s1, v0 -; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_xor_b32 s1, s0, 0xffff -; GFX9-NEXT: v_min_u16_e32 v0, s1, v0 -; GFX9-NEXT: v_add_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %value, i16 %amount) %cast = bitcast i16 %result to half @@ -944,16 +1024,12 @@ ; ; GFX8-LABEL: uaddsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff, v0 -; GFX8-NEXT: v_min_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v0, v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_min_u16_e32 v1, s0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.uadd.sat.i16(i16 %value, i16 %amount) %cast = bitcast i16 %result to half @@ -981,24 +1057,17 @@ ; GFX8-LABEL: v_uaddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v2 -; GFX8-NEXT: v_min_u16_e32 v3, v3, v1 -; GFX8-NEXT: v_min_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v0 -; GFX9-NEXT: v_pk_min_u16 v1, v2, v1 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) ret <2 x i16> %result @@ -1030,46 +1099,23 @@ ; ; GFX8-LABEL: s_uaddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: s_xor_b32 s5, s0, s4 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s1 -; GFX8-NEXT: s_cselect_b32 s1, s5, s1 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s1, s2, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s2, -1, -1 -; GFX9-NEXT: s_xor_b32 s2, s0, s2 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s1, s1, s4 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s2, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to i32 @@ -1100,25 +1146,18 @@ ; ; GFX8-LABEL: uaddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_xor_b32 s3, s0, s2 -; GFX8-NEXT: s_xor_b32 s2, s1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_min_u16_e32 v1, s3, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: s_xor_b32 s1, s0, s1 -; GFX9-NEXT: v_pk_min_u16 v0, s1, v0 -; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to float @@ -1149,24 +1188,18 @@ ; ; GFX8-LABEL: uaddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v1 -; GFX8-NEXT: v_min_u16_e32 v2, s0, v2 -; GFX8-NEXT: v_min_u16_e32 v3, s1, v3 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v0 -; GFX9-NEXT: v_pk_min_u16 v1, v1, s0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to float @@ -1226,35 +1259,22 @@ ; GFX8-LABEL: v_uaddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v4 -; GFX8-NEXT: v_min_u16_e32 v6, v6, v2 -; GFX8-NEXT: v_min_u16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v8, s4, v5 -; GFX8-NEXT: v_min_u16_e32 v7, v7, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0 -; GFX9-NEXT: v_pk_min_u16 v2, v4, v2 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v1 -; GFX9-NEXT: v_pk_min_u16 v2, v2, v3 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %value, <4 x i16> %amount) %cast = bitcast <4 x i16> %result to <2 x float> @@ -1305,79 +1325,35 @@ ; ; GFX8-LABEL: s_uaddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s8, 0xffff -; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s9, s2 -; GFX8-NEXT: s_cselect_b32 s2, s9, s2 -; GFX8-NEXT: s_add_i32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s2, s4, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s2, s6 -; GFX8-NEXT: s_add_i32 s4, s4, s2 -; GFX8-NEXT: s_xor_b32 s2, s1, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s3 -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s2, s5, s8 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s3 -; GFX8-NEXT: s_cselect_b32 s2, s2, s3 -; GFX8-NEXT: s_add_i32 s5, s5, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s5, s7 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_cmp_lt_u32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s5, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s2, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 -; GFX9-NEXT: s_cmp_lt_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %value, <4 x i16> %amount) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -1453,46 +1429,28 @@ ; GFX8-LABEL: v_uaddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v10, s4, v6 -; GFX8-NEXT: v_min_u16_e32 v9, v9, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v10, s4, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v11, s4, v7 -; GFX8-NEXT: v_min_u16_e32 v10, v10, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v11, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v12, s4, v8 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v11, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v0 -; GFX9-NEXT: v_pk_min_u16 v3, v6, v3 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_pk_min_u16 v3, v3, v4 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v2 -; GFX9-NEXT: v_pk_min_u16 v3, v3, v5 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %value, <6 x i16> %amount) %cast = bitcast <6 x i16> %result to <3 x float> @@ -1561,112 +1519,47 @@ ; ; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff -; GFX8-NEXT: s_xor_b32 s13, s0, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s13, s13, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s13, s3 -; GFX8-NEXT: s_cselect_b32 s3, s13, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s3, s6, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s3, s9 -; GFX8-NEXT: s_add_i32 s6, s6, s3 -; GFX8-NEXT: s_xor_b32 s3, s1, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s1, s1, s3 -; GFX8-NEXT: s_xor_b32 s3, s7, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s7, s7, s3 -; GFX8-NEXT: s_xor_b32 s3, s2, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_xor_b32 s3, s8, s12 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_add_i32 s8, s8, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_cmp_lt_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_cmp_lt_u32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s7, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s3, s1, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_u32 s3, s4 -; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_u32 s7, s8 -; GFX9-NEXT: s_cselect_b32 s4, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_xor_b32 s3, s2, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_u32 s3, s5 -; GFX9-NEXT: s_cselect_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %value, <6 x i16> %amount) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -1747,57 +1640,33 @@ ; GFX8-LABEL: v_uaddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v12, s4, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v8 -; GFX8-NEXT: v_min_u16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v13, s4, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v9 -; GFX8-NEXT: v_min_u16_e32 v13, v13, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v14, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v10 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v14, v14, v6 -; GFX8-NEXT: v_min_u16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v15, s4, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_xor_b32_e32 v16, s4, v11 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v15, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_uaddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 -; GFX9-NEXT: v_pk_min_u16 v4, v8, v4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v1 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v5 -; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v2 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v6 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v3 -; GFX9-NEXT: v_pk_min_u16 v4, v4, v7 -; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %value, <8 x i16> %amount) %cast = bitcast <8 x i16> %result to <4 x float> @@ -1884,145 +1753,59 @@ ; ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s16, 0xffff -; GFX8-NEXT: s_xor_b32 s17, s0, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s17, s17, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s17, s4 -; GFX8-NEXT: s_cselect_b32 s4, s17, s4 -; GFX8-NEXT: s_add_i32 s0, s0, s4 -; GFX8-NEXT: s_xor_b32 s4, s8, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_add_i32 s8, s8, s4 -; GFX8-NEXT: s_xor_b32 s4, s1, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s1, s1, s4 -; GFX8-NEXT: s_xor_b32 s4, s9, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s9, s9, s4 -; GFX8-NEXT: s_xor_b32 s4, s2, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s2, s2, s4 -; GFX8-NEXT: s_xor_b32 s4, s10, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s10, s10, s4 -; GFX8-NEXT: s_xor_b32 s4, s3, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_xor_b32 s4, s11, s16 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s5 -; GFX8-NEXT: s_cselect_b32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s11, s11, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_mov_b32 s11, 0xffff -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_and_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_cmp_lt_u32 s9, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cmp_lt_u32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s9, s10, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s9, s9, s10 -; GFX9-NEXT: s_xor_b32 s4, s1, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s5, s5, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s5, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s4, s2, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s9, s6, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s6, s6, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_u32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s7, s7, s11 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_add_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %value, <8 x i16> %amount) %cast = bitcast <8 x i16> %result to <4 x i32> Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -19,8 +19,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -29,8 +28,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.usub.sat.i7(i7 %value, i7 %amount) @@ -53,13 +51,10 @@ ; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i7: @@ -67,13 +62,10 @@ ; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i7 @llvm.usub.sat.i7(i7 %value, i7 %amount) ret i7 %result @@ -95,8 +87,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -105,8 +96,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %value, i8 %amount) @@ -129,13 +119,10 @@ ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s1 -; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i8: @@ -143,13 +130,10 @@ ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i8 @llvm.usub.sat.i8(i8 %value, i8 %amount) ret i8 %result @@ -171,8 +155,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -181,8 +164,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.usub.sat.i24(i24 %value, i24 %amount) @@ -190,15 +172,35 @@ } define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %value, i24 inreg %amount) { -; GCN-LABEL: s_usubsat_i24: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, 8 -; GCN-NEXT: s_lshl_b32 s1, s1, 8 -; GCN-NEXT: s_cmp_lt_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s1, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s1 -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, 8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_cmp_lt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog %result = call i24 @llvm.usub.sat.i24(i24 %value, i24 %amount) ret i24 %result } @@ -214,27 +216,39 @@ ; GFX8-LABEL: v_usubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.usub.sat.i32(i32 %value, i32 %amount) ret i32 %result } define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %value, i32 inreg %amount) { -; GCN-LABEL: s_usubsat_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s1 -; GCN-NEXT: s_cselect_b32 s1, s0, s1 -; GCN-NEXT: s_sub_i32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s1 +; GFX6-NEXT: s_cselect_b32 s1, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %value, i32 %amount) ret i32 %result } @@ -248,14 +262,12 @@ ; ; GFX8-LABEL: usubsat_i32_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %value, i32 %amount) %cast = bitcast i32 %result to float @@ -271,14 +283,12 @@ ; ; GFX8-LABEL: usubsat_i32_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i32_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u32_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i32 @llvm.usub.sat.i32(i32 %value, i32 %amount) %cast = bitcast i32 %result to float @@ -298,34 +308,50 @@ ; GFX8-LABEL: v_usubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v2, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_min_u32_e32 v2, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %value, <2 x i32> %amount) ret <2 x i32> %result } define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %value, <2 x i32> inreg %amount) { -; GCN-LABEL: s_usubsat_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s2 -; GCN-NEXT: s_cselect_b32 s2, s0, s2 -; GCN-NEXT: s_sub_i32 s0, s0, s2 -; GCN-NEXT: s_cmp_lt_u32 s1, s3 -; GCN-NEXT: s_cselect_b32 s2, s1, s3 -; GCN-NEXT: s_sub_i32 s1, s1, s2 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s2 +; GFX6-NEXT: s_cselect_b32 s2, s0, s2 +; GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_cmp_lt_u32 s1, s3 +; GFX6-NEXT: s_cselect_b32 s2, s1, s3 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %value, <2 x i32> %amount) ret <2 x i32> %result } @@ -345,41 +371,61 @@ ; GFX8-LABEL: v_usubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v3i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v3, v0, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v2, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %value, <3 x i32> %amount) ret <3 x i32> %result } define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %value, <3 x i32> inreg %amount) { -; GCN-LABEL: s_usubsat_v3i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s3 -; GCN-NEXT: s_cselect_b32 s3, s0, s3 -; GCN-NEXT: s_sub_i32 s0, s0, s3 -; GCN-NEXT: s_cmp_lt_u32 s1, s4 -; GCN-NEXT: s_cselect_b32 s3, s1, s4 -; GCN-NEXT: s_sub_i32 s1, s1, s3 -; GCN-NEXT: s_cmp_lt_u32 s2, s5 -; GCN-NEXT: s_cselect_b32 s3, s2, s5 -; GCN-NEXT: s_sub_i32 s2, s2, s3 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s3, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_cmp_lt_u32 s1, s4 +; GFX6-NEXT: s_cselect_b32 s3, s1, s4 +; GFX6-NEXT: s_sub_i32 s1, s1, s3 +; GFX6-NEXT: s_cmp_lt_u32 s2, s5 +; GFX6-NEXT: s_cselect_b32 s3, s2, s5 +; GFX6-NEXT: s_sub_i32 s2, s2, s3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: ; return to shader part epilog %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %value, <3 x i32> %amount) ret <3 x i32> %result } @@ -401,48 +447,72 @@ ; GFX8-LABEL: v_usubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 -; GFX8-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v4, v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v1, v5 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v2, v6 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_min_u32_e32 v4, v3, v7 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %value, <4 x i32> %amount) ret <4 x i32> %result } define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %value, <4 x i32> inreg %amount) { -; GCN-LABEL: s_usubsat_v4i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s4, s0, s4 -; GCN-NEXT: s_sub_i32 s0, s0, s4 -; GCN-NEXT: s_cmp_lt_u32 s1, s5 -; GCN-NEXT: s_cselect_b32 s4, s1, s5 -; GCN-NEXT: s_sub_i32 s1, s1, s4 -; GCN-NEXT: s_cmp_lt_u32 s2, s6 -; GCN-NEXT: s_cselect_b32 s4, s2, s6 -; GCN-NEXT: s_sub_i32 s2, s2, s4 -; GCN-NEXT: s_cmp_lt_u32 s3, s7 -; GCN-NEXT: s_cselect_b32 s4, s3, s7 -; GCN-NEXT: s_sub_i32 s3, s3, s4 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s4 +; GFX6-NEXT: s_cselect_b32 s4, s0, s4 +; GFX6-NEXT: s_sub_i32 s0, s0, s4 +; GFX6-NEXT: s_cmp_lt_u32 s1, s5 +; GFX6-NEXT: s_cselect_b32 s4, s1, s5 +; GFX6-NEXT: s_sub_i32 s1, s1, s4 +; GFX6-NEXT: s_cmp_lt_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_cmp_lt_u32 s3, s7 +; GFX6-NEXT: s_cselect_b32 s4, s3, s7 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %value, <4 x i32> %amount) ret <4 x i32> %result } @@ -466,55 +536,83 @@ ; GFX8-LABEL: v_usubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v5i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v1, v6 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v2, v7 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v3, v8 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_min_u32_e32 v5, v4, v9 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v5 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %value, <5 x i32> %amount) ret <5 x i32> %result } define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %value, <5 x i32> inreg %amount) { -; GCN-LABEL: s_usubsat_v5i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s5 -; GCN-NEXT: s_cselect_b32 s5, s0, s5 -; GCN-NEXT: s_sub_i32 s0, s0, s5 -; GCN-NEXT: s_cmp_lt_u32 s1, s6 -; GCN-NEXT: s_cselect_b32 s5, s1, s6 -; GCN-NEXT: s_sub_i32 s1, s1, s5 -; GCN-NEXT: s_cmp_lt_u32 s2, s7 -; GCN-NEXT: s_cselect_b32 s5, s2, s7 -; GCN-NEXT: s_sub_i32 s2, s2, s5 -; GCN-NEXT: s_cmp_lt_u32 s3, s8 -; GCN-NEXT: s_cselect_b32 s5, s3, s8 -; GCN-NEXT: s_sub_i32 s3, s3, s5 -; GCN-NEXT: s_cmp_lt_u32 s4, s9 -; GCN-NEXT: s_cselect_b32 s5, s4, s9 -; GCN-NEXT: s_sub_i32 s4, s4, s5 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_v5i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s5 +; GFX6-NEXT: s_cselect_b32 s5, s0, s5 +; GFX6-NEXT: s_sub_i32 s0, s0, s5 +; GFX6-NEXT: s_cmp_lt_u32 s1, s6 +; GFX6-NEXT: s_cselect_b32 s5, s1, s6 +; GFX6-NEXT: s_sub_i32 s1, s1, s5 +; GFX6-NEXT: s_cmp_lt_u32 s2, s7 +; GFX6-NEXT: s_cselect_b32 s5, s2, s7 +; GFX6-NEXT: s_sub_i32 s2, s2, s5 +; GFX6-NEXT: s_cmp_lt_u32 s3, s8 +; GFX6-NEXT: s_cselect_b32 s5, s3, s8 +; GFX6-NEXT: s_sub_i32 s3, s3, s5 +; GFX6-NEXT: s_cmp_lt_u32 s4, s9 +; GFX6-NEXT: s_cselect_b32 s5, s4, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_v5i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_v5i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: ; return to shader part epilog %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %value, <5 x i32> %amount) ret <5 x i32> %result } @@ -560,132 +658,204 @@ ; GFX8-LABEL: v_usubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v16 -; GFX8-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 +; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v31 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v16i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v16, v0, v16 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v1, v17 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v2, v18 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v3, v19 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v4, v20 -; GFX9-NEXT: v_sub_u32_e32 v4, v4, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v5, v21 -; GFX9-NEXT: v_sub_u32_e32 v5, v5, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v6, v22 -; GFX9-NEXT: v_sub_u32_e32 v6, v6, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v7, v23 -; GFX9-NEXT: v_sub_u32_e32 v7, v7, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v8, v24 -; GFX9-NEXT: v_sub_u32_e32 v8, v8, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v9, v25 -; GFX9-NEXT: v_sub_u32_e32 v9, v9, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v10, v26 -; GFX9-NEXT: v_sub_u32_e32 v10, v10, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v11, v27 -; GFX9-NEXT: v_sub_u32_e32 v11, v11, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v12, v28 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v13, v29 -; GFX9-NEXT: v_sub_u32_e32 v13, v13, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v14, v30 -; GFX9-NEXT: v_sub_u32_e32 v14, v14, v16 -; GFX9-NEXT: v_min_u32_e32 v16, v15, v31 -; GFX9-NEXT: v_sub_u32_e32 v15, v15, v16 +; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, v15, v31 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %value, <16 x i32> %amount) ret <16 x i32> %result } define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %value, <16 x i32> inreg %amount) { -; GCN-LABEL: s_usubsat_v16i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_cmp_lt_u32 s0, s16 -; GCN-NEXT: s_cselect_b32 s16, s0, s16 -; GCN-NEXT: s_sub_i32 s0, s0, s16 -; GCN-NEXT: s_cmp_lt_u32 s1, s17 -; GCN-NEXT: s_cselect_b32 s16, s1, s17 -; GCN-NEXT: s_sub_i32 s1, s1, s16 -; GCN-NEXT: s_cmp_lt_u32 s2, s18 -; GCN-NEXT: s_cselect_b32 s16, s2, s18 -; GCN-NEXT: s_sub_i32 s2, s2, s16 -; GCN-NEXT: s_cmp_lt_u32 s3, s19 -; GCN-NEXT: s_cselect_b32 s16, s3, s19 -; GCN-NEXT: s_sub_i32 s3, s3, s16 -; GCN-NEXT: s_cmp_lt_u32 s4, s20 -; GCN-NEXT: s_cselect_b32 s16, s4, s20 -; GCN-NEXT: s_sub_i32 s4, s4, s16 -; GCN-NEXT: s_cmp_lt_u32 s5, s21 -; GCN-NEXT: s_cselect_b32 s16, s5, s21 -; GCN-NEXT: s_sub_i32 s5, s5, s16 -; GCN-NEXT: s_cmp_lt_u32 s6, s22 -; GCN-NEXT: s_cselect_b32 s16, s6, s22 -; GCN-NEXT: s_sub_i32 s6, s6, s16 -; GCN-NEXT: s_cmp_lt_u32 s7, s23 -; GCN-NEXT: s_cselect_b32 s16, s7, s23 -; GCN-NEXT: s_sub_i32 s7, s7, s16 -; GCN-NEXT: s_cmp_lt_u32 s8, s24 -; GCN-NEXT: s_cselect_b32 s16, s8, s24 -; GCN-NEXT: s_sub_i32 s8, s8, s16 -; GCN-NEXT: s_cmp_lt_u32 s9, s25 -; GCN-NEXT: s_cselect_b32 s16, s9, s25 -; GCN-NEXT: s_sub_i32 s9, s9, s16 -; GCN-NEXT: s_cmp_lt_u32 s10, s26 -; GCN-NEXT: s_cselect_b32 s16, s10, s26 -; GCN-NEXT: s_sub_i32 s10, s10, s16 -; GCN-NEXT: s_cmp_lt_u32 s11, s27 -; GCN-NEXT: s_cselect_b32 s16, s11, s27 -; GCN-NEXT: s_sub_i32 s11, s11, s16 -; GCN-NEXT: s_cmp_lt_u32 s12, s28 -; GCN-NEXT: s_cselect_b32 s16, s12, s28 -; GCN-NEXT: s_sub_i32 s12, s12, s16 -; GCN-NEXT: s_cmp_lt_u32 s13, s29 -; GCN-NEXT: s_cselect_b32 s16, s13, s29 -; GCN-NEXT: s_sub_i32 s13, s13, s16 -; GCN-NEXT: s_cmp_lt_u32 s14, s30 -; GCN-NEXT: s_cselect_b32 s16, s14, s30 -; GCN-NEXT: s_sub_i32 s14, s14, s16 -; GCN-NEXT: s_cmp_lt_u32 s15, s31 -; GCN-NEXT: s_cselect_b32 s16, s15, s31 -; GCN-NEXT: s_sub_i32 s15, s15, s16 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_usubsat_v16i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_cmp_lt_u32 s0, s16 +; GFX6-NEXT: s_cselect_b32 s16, s0, s16 +; GFX6-NEXT: s_sub_i32 s0, s0, s16 +; GFX6-NEXT: s_cmp_lt_u32 s1, s17 +; GFX6-NEXT: s_cselect_b32 s16, s1, s17 +; GFX6-NEXT: s_sub_i32 s1, s1, s16 +; GFX6-NEXT: s_cmp_lt_u32 s2, s18 +; GFX6-NEXT: s_cselect_b32 s16, s2, s18 +; GFX6-NEXT: s_sub_i32 s2, s2, s16 +; GFX6-NEXT: s_cmp_lt_u32 s3, s19 +; GFX6-NEXT: s_cselect_b32 s16, s3, s19 +; GFX6-NEXT: s_sub_i32 s3, s3, s16 +; GFX6-NEXT: s_cmp_lt_u32 s4, s20 +; GFX6-NEXT: s_cselect_b32 s16, s4, s20 +; GFX6-NEXT: s_sub_i32 s4, s4, s16 +; GFX6-NEXT: s_cmp_lt_u32 s5, s21 +; GFX6-NEXT: s_cselect_b32 s16, s5, s21 +; GFX6-NEXT: s_sub_i32 s5, s5, s16 +; GFX6-NEXT: s_cmp_lt_u32 s6, s22 +; GFX6-NEXT: s_cselect_b32 s16, s6, s22 +; GFX6-NEXT: s_sub_i32 s6, s6, s16 +; GFX6-NEXT: s_cmp_lt_u32 s7, s23 +; GFX6-NEXT: s_cselect_b32 s16, s7, s23 +; GFX6-NEXT: s_sub_i32 s7, s7, s16 +; GFX6-NEXT: s_cmp_lt_u32 s8, s24 +; GFX6-NEXT: s_cselect_b32 s16, s8, s24 +; GFX6-NEXT: s_sub_i32 s8, s8, s16 +; GFX6-NEXT: s_cmp_lt_u32 s9, s25 +; GFX6-NEXT: s_cselect_b32 s16, s9, s25 +; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_cmp_lt_u32 s10, s26 +; GFX6-NEXT: s_cselect_b32 s16, s10, s26 +; GFX6-NEXT: s_sub_i32 s10, s10, s16 +; GFX6-NEXT: s_cmp_lt_u32 s11, s27 +; GFX6-NEXT: s_cselect_b32 s16, s11, s27 +; GFX6-NEXT: s_sub_i32 s11, s11, s16 +; GFX6-NEXT: s_cmp_lt_u32 s12, s28 +; GFX6-NEXT: s_cselect_b32 s16, s12, s28 +; GFX6-NEXT: s_sub_i32 s12, s12, s16 +; GFX6-NEXT: s_cmp_lt_u32 s13, s29 +; GFX6-NEXT: s_cselect_b32 s16, s13, s29 +; GFX6-NEXT: s_sub_i32 s13, s13, s16 +; GFX6-NEXT: s_cmp_lt_u32 s14, s30 +; GFX6-NEXT: s_cselect_b32 s16, s14, s30 +; GFX6-NEXT: s_sub_i32 s14, s14, s16 +; GFX6-NEXT: s_cmp_lt_u32 s15, s31 +; GFX6-NEXT: s_cselect_b32 s16, s15, s31 +; GFX6-NEXT: s_sub_i32 s15, s15, s16 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_usubsat_v16i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s25 +; GFX8-NEXT: v_mov_b32_e32 v10, s26 +; GFX8-NEXT: v_mov_b32_e32 v11, s27 +; GFX8-NEXT: v_mov_b32_e32 v12, s28 +; GFX8-NEXT: v_mov_b32_e32 v13, s29 +; GFX8-NEXT: v_mov_b32_e32 v14, s30 +; GFX8-NEXT: v_mov_b32_e32 v15, s31 +; GFX8-NEXT: v_sub_u32_e64 v0, s[32:33], s0, v0 clamp +; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp +; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp +; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp +; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp +; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp +; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp +; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp +; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp +; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp +; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp +; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp +; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp +; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp +; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp +; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_readfirstlane_b32 s4, v4 +; GFX8-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8-NEXT: v_readfirstlane_b32 s6, v6 +; GFX8-NEXT: v_readfirstlane_b32 s7, v7 +; GFX8-NEXT: v_readfirstlane_b32 s8, v8 +; GFX8-NEXT: v_readfirstlane_b32 s9, v9 +; GFX8-NEXT: v_readfirstlane_b32 s10, v10 +; GFX8-NEXT: v_readfirstlane_b32 s11, v11 +; GFX8-NEXT: v_readfirstlane_b32 s12, v12 +; GFX8-NEXT: v_readfirstlane_b32 s13, v13 +; GFX8-NEXT: v_readfirstlane_b32 s14, v14 +; GFX8-NEXT: v_readfirstlane_b32 s15, v15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_usubsat_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_mov_b32_e32 v8, s24 +; GFX9-NEXT: v_mov_b32_e32 v9, s25 +; GFX9-NEXT: v_mov_b32_e32 v10, s26 +; GFX9-NEXT: v_mov_b32_e32 v11, s27 +; GFX9-NEXT: v_mov_b32_e32 v12, s28 +; GFX9-NEXT: v_mov_b32_e32 v13, s29 +; GFX9-NEXT: v_mov_b32_e32 v14, s30 +; GFX9-NEXT: v_mov_b32_e32 v15, s31 +; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp +; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp +; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp +; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp +; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp +; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp +; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp +; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp +; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp +; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp +; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp +; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp +; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp +; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp +; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_readfirstlane_b32 s4, v4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9-NEXT: v_readfirstlane_b32 s6, v6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v7 +; GFX9-NEXT: v_readfirstlane_b32 s8, v8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v10 +; GFX9-NEXT: v_readfirstlane_b32 s11, v11 +; GFX9-NEXT: v_readfirstlane_b32 s12, v12 +; GFX9-NEXT: v_readfirstlane_b32 s13, v13 +; GFX9-NEXT: v_readfirstlane_b32 s14, v14 +; GFX9-NEXT: v_readfirstlane_b32 s15, v15 +; GFX9-NEXT: ; return to shader part epilog %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %value, <16 x i32> %amount) ret <16 x i32> %result } @@ -704,15 +874,13 @@ ; GFX8-LABEL: v_usubsat_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u16_e32 v1, v0, v1 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %value, i16 %amount) ret i16 %result @@ -731,20 +899,16 @@ ; ; GFX8-LABEL: s_usubsat_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s1 -; GFX8-NEXT: s_cselect_b32 s1, s2, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s2, s0, 0x100000 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_cmp_lt_u32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %value, i16 %amount) ret i16 %result @@ -762,14 +926,12 @@ ; ; GFX8-LABEL: usubsat_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %value, i16 %amount) %cast = bitcast i16 %result to half @@ -788,14 +950,12 @@ ; ; GFX8-LABEL: usubsat_i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 +; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call i16 @llvm.usub.sat.i16(i16 %value, i16 %amount) %cast = bitcast i16 %result to half @@ -821,19 +981,17 @@ ; GFX8-LABEL: v_usubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v3, v0, v1 -; GFX8-NEXT: v_min_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v1, v0, v1 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) ret <2 x i16> %result @@ -865,38 +1023,21 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s1 -; GFX8-NEXT: s_cselect_b32 s1, s4, s1 -; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s1, s3 -; GFX8-NEXT: s_cselect_b32 s1, s1, s3 -; GFX8-NEXT: s_sub_i32 s1, s2, s1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s4, s0, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 -; GFX9-NEXT: s_cmp_lt_u32 s4, s1 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_cmp_lt_u32 s2, s5 -; GFX9-NEXT: s_cselect_b32 s3, s2, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s1 -; GFX9-NEXT: s_sub_i32 s1, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to i32 @@ -927,17 +1068,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_min_u16_e32 v1, s0, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 -; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v0, s0, v0 -; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to float @@ -966,19 +1106,18 @@ ; ; GFX8-LABEL: usubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_u16_e32 v2, s0, v0 -; GFX8-NEXT: v_min_u16_e32 v3, s1, v1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_pk_min_u16 v1, v0, s0 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp ; GFX9-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %value, <2 x i16> %amount) %cast = bitcast <2 x i16> %result to float @@ -1034,27 +1173,22 @@ ; GFX8-LABEL: v_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v6, v0, v2 -; GFX8-NEXT: v_min_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v7, v1, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v2, v0, v2 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_u16 v2, v1, v3 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %value, <4 x i16> %amount) %cast = bitcast <4 x i16> %result to <2 x float> @@ -1102,68 +1236,34 @@ ; GFX8-LABEL: s_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s2 -; GFX8-NEXT: s_cselect_b32 s2, s8, s2 -; GFX8-NEXT: s_sub_i32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s2, s6 -; GFX8-NEXT: s_cselect_b32 s2, s2, s6 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s3 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s4 -; GFX8-NEXT: s_cselect_b32 s3, s3, s4 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_sub_i32 s3, s5, s3 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s6, s0, s5 -; GFX9-NEXT: s_and_b32 s2, s2, s5 -; GFX9-NEXT: s_cmp_lt_u32 s6, s2 -; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_cmp_lt_u32 s4, s7 -; GFX9-NEXT: s_cselect_b32 s6, s4, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s2, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s4, s1, s5 -; GFX9-NEXT: s_and_b32 s3, s3, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_cmp_lt_u32 s2, s6 -; GFX9-NEXT: s_cselect_b32 s4, s2, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %value, <4 x i16> %amount) %cast = bitcast <4 x i16> %result to <2 x i32> @@ -1233,35 +1333,28 @@ ; GFX8-LABEL: v_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v9, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v10, v1, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v11, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_min_u16_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v3, v0, v3 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v1, v4 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_min_u16 v3, v2, v5 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %value, <6 x i16> %amount) %cast = bitcast <6 x i16> %result to <3 x float> @@ -1325,97 +1418,46 @@ ; GFX8-LABEL: s_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s12, s3 -; GFX8-NEXT: s_cselect_b32 s3, s12, s3 -; GFX8-NEXT: s_sub_i32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 -; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s3, s9 -; GFX8-NEXT: s_cselect_b32 s3, s3, s9 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s4 -; GFX8-NEXT: s_cselect_b32 s4, s6, s4 -; GFX8-NEXT: s_sub_i32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s6 -; GFX8-NEXT: s_cselect_b32 s4, s4, s6 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s5 -; GFX8-NEXT: s_cselect_b32 s5, s6, s5 -; GFX8-NEXT: s_sub_i32 s2, s2, s5 -; GFX8-NEXT: s_bfe_u32 s5, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s11, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s6 -; GFX8-NEXT: s_cselect_b32 s5, s5, s6 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s3 -; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_sub_i32 s5, s8, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s3 -; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 -; GFX9-NEXT: s_cmp_lt_u32 s8, s3 -; GFX9-NEXT: s_cselect_b32 s3, s8, s3 -; GFX9-NEXT: s_cmp_lt_u32 s6, s9 -; GFX9-NEXT: s_cselect_b32 s8, s6, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s6, s1, s7 -; GFX9-NEXT: s_and_b32 s4, s4, s7 -; GFX9-NEXT: s_cmp_lt_u32 s6, s4 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cmp_lt_u32 s3, s8 -; GFX9-NEXT: s_cselect_b32 s6, s3, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s4, s2, s7 -; GFX9-NEXT: s_and_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s3, s6 -; GFX9-NEXT: s_cselect_b32 s5, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 ; GFX9-NEXT: ; return to shader part epilog %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %value, <6 x i16> %amount) %cast = bitcast <6 x i16> %result to <3 x i32> @@ -1488,43 +1530,33 @@ ; GFX8-LABEL: v_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_min_u16_e32 v12, v0, v4 -; GFX8-NEXT: v_min_u16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_u16_e32 v13, v1, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_u16_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v14, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_u16_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_u16_e32 v15, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_min_u16_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_u16 v4, v0, v4 -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v1, v5 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v2, v6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_u16 v4, v3, v7 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %value, <8 x i16> %amount) %cast = bitcast <8 x i16> %result to <4 x float> @@ -1604,126 +1636,58 @@ ; GFX8-LABEL: s_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_bfe_u32 s16, s0, 0x100000 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s16, s4 -; GFX8-NEXT: s_cselect_b32 s4, s16, s4 -; GFX8-NEXT: s_sub_i32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s4, s12 -; GFX8-NEXT: s_cselect_b32 s4, s4, s12 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x100000 -; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s5 -; GFX8-NEXT: s_cselect_b32 s5, s8, s5 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 -; GFX8-NEXT: s_bfe_u32 s5, s9, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s13, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s5, s8 -; GFX8-NEXT: s_cselect_b32 s5, s5, s8 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s6 -; GFX8-NEXT: s_cselect_b32 s6, s8, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 -; GFX8-NEXT: s_bfe_u32 s6, s10, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s14, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s6, s8 -; GFX8-NEXT: s_cselect_b32 s6, s6, s8 -; GFX8-NEXT: s_sub_i32 s6, s10, s6 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x100000 -; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s8, s7 -; GFX8-NEXT: s_cselect_b32 s7, s8, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s7 -; GFX8-NEXT: s_bfe_u32 s7, s11, 0x100000 -; GFX8-NEXT: s_bfe_u32 s8, s15, 0x100000 -; GFX8-NEXT: s_cmp_lt_u32 s7, s8 -; GFX8-NEXT: s_cselect_b32 s7, s7, s8 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 -; GFX8-NEXT: s_sub_i32 s7, s11, s7 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp +; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_u32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_cmp_lt_u32 s8, s11 -; GFX9-NEXT: s_cselect_b32 s10, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s8, s1, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 -; GFX9-NEXT: s_cmp_lt_u32 s8, s5 -; GFX9-NEXT: s_cselect_b32 s5, s8, s5 -; GFX9-NEXT: s_cmp_lt_u32 s4, s10 -; GFX9-NEXT: s_cselect_b32 s8, s4, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s8, s6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s2, s9 -; GFX9-NEXT: s_and_b32 s6, s6, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s6 -; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_u32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s6, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s5, s3, s9 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_u32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_u32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s6, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s5 -; GFX9-NEXT: s_sub_i32 s4, s4, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp +; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp +; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp +; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %value, <8 x i16> %amount) %cast = bitcast <8 x i16> %result to <4 x i32>