diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -656,12 +656,11 @@ .widenScalarToNextPow2(0) .scalarize(0); - getActionDefinitionsBuilder({G_UADDO, G_USUBO, - G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) - .legalFor({{S32, S1}, {S32, S32}}) - .minScalar(0, S32) - .scalarize(0) - .lower(); + getActionDefinitionsBuilder( + {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) + .legalFor({{S32, S1}, {S32, S32}}) + .clampScalar(0, S32, S32) + .scalarize(0); getActionDefinitionsBuilder(G_BITCAST) // Don't worry about the size constraint. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -41,7 +41,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -52,7 +51,6 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -63,7 +61,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -480,46 +477,28 @@ ; GFX7-LABEL: s_uaddo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_add_u32 s0, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_addc_u32 s1, s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_add_u32 s0, s0, s2 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_add_u32 s0, s0, s2 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: ; return to shader part epilog %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %add = extractvalue {i64, i1} %uaddo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s --- name: test_sadde_s32 @@ -120,9 +120,13 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SADDE:%[0-9]+]]:_(s64), [[SADDE1:%[0-9]+]]:_(s1) = G_SADDE [[COPY]], [[COPY1]], [[ICMP]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[ICMP]] + ; CHECK-NEXT: [[SADDE:%[0-9]+]]:_(s32), [[SADDE1:%[0-9]+]]:_(s1) = G_SADDE [[UV1]], [[UV3]], [[UADDE1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[SADDE]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SADDE1]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[SADDE]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s --- name: test_ssube_s32 @@ -119,9 +119,13 @@ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] - ; CHECK-NEXT: [[SSUBE:%[0-9]+]]:_(s64), [[SSUBE1:%[0-9]+]]:_(s1) = G_SSUBE [[COPY]], [[COPY1]], [[ICMP]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV]], [[UV2]], [[ICMP]] + ; CHECK-NEXT: [[SSUBE:%[0-9]+]]:_(s32), [[SSUBE1:%[0-9]+]]:_(s1) = G_SSUBE [[UV1]], [[UV3]], [[USUBE1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE]](s32), [[SSUBE]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SSUBE1]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[SSUBE]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir @@ -123,17 +123,12 @@ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] - ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ICMP]](s1) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) - ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[UV4]] - ; CHECK-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[UV5]], [[UADDO3]] - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY]] - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP1]](s1) + ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[ICMP]] + ; CHECK-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDE1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[UADDE2]](s32) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDE3]](s1) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT1]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s32) = COPY $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir @@ -106,8 +106,7 @@ ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDE1]](s1) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir @@ -720,9 +720,8 @@ ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]] ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; GFX8-LABEL: name: uaddsat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -734,9 +733,8 @@ ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; GFX9-LABEL: name: uaddsat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -748,9 +746,8 @@ ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -776,16 +773,14 @@ ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]] ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]] - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX8-LABEL: name: uaddsat_v2s64 @@ -800,16 +795,14 @@ ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: uaddsat_v2s64 @@ -824,16 +817,14 @@ ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]] ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir @@ -123,23 +123,12 @@ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] - ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ICMP]](s1) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64) - ; CHECK-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[USUBO]], [[UV4]] - ; CHECK-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[USUBE]], [[UV5]], [[USUBO3]] - ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ICMP]](s1) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[ICMP2]](s1) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[ICMP1]](s1), [[ANYEXT]], [[ANYEXT1]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT]](s16) - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]] + ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV]], [[UV2]], [[ICMP]] + ; CHECK-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBE1]] + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE]](s32), [[USUBE2]](s32) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBE3]](s1) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) - ; CHECK-NEXT: $vgpr2 = COPY [[AND]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s32) = COPY $vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir @@ -106,8 +106,7 @@ ; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBE1]](s1) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -694,9 +694,8 @@ ; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; GFX8-LABEL: name: usubsat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -708,9 +707,8 @@ ; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; GFX9-LABEL: name: usubsat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -722,9 +720,8 @@ ; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -750,16 +747,14 @@ ; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]] ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]] ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]] ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]] - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX8-LABEL: name: usubsat_v2s64 @@ -774,16 +769,14 @@ ; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]] ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]] ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]] - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: usubsat_v2s64 @@ -798,16 +791,14 @@ ; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]] ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]] ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]] ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]] ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32) - ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]] - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -39,34 +39,31 @@ ; GFX7-LABEL: v_usubo_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubo_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v0 -; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubo_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %sub = extractvalue {i64, i1} %usubo, 0 @@ -479,47 +476,29 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) { ; GFX7-LABEL: s_usubo_i64: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX7-NEXT: s_sub_u32 s4, s0, s2 -; GFX7-NEXT: s_subb_u32 s5, s1, s3 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: s_sub_u32 s0, s0, s2 +; GFX7-NEXT: s_subb_u32 s1, s1, s3 +; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_sub_u32 s0, s0, s2 +; GFX7-NEXT: s_subb_u32 s1, s1, 0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubo_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: s_subb_u32 s5, s1, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_sub_u32 s0, s0, s2 +; GFX8-NEXT: s_subb_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_sub_u32 s0, s0, s2 +; GFX8-NEXT: s_subb_u32 s1, s1, 0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubo_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_sub_u32 s0, s0, s2 +; GFX9-NEXT: s_subb_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-NEXT: s_sub_u32 s0, s0, s2 +; GFX9-NEXT: s_subb_u32 s1, s1, 0 ; GFX9-NEXT: ; return to shader part epilog %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %sub = extractvalue {i64, i1} %usubo, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2640,13 +2640,18 @@ ; GFX6-LABEL: v_uaddsat_i48: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i48: @@ -2656,7 +2661,6 @@ ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2669,7 +2673,6 @@ ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2683,7 +2686,6 @@ ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2695,21 +2697,21 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-LABEL: s_uaddsat_i48: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_add_u32 s0, s0, s2 -; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: s_addc_u32 s1, s1, s3 -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5] -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s2, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_cmp_lg_u32 s2, s1 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i48: @@ -2717,17 +2719,9 @@ ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i48: @@ -2735,17 +2729,9 @@ ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i48: @@ -2754,12 +2740,8 @@ ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2 -; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) ret i48 %result @@ -2768,14 +2750,19 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX6-LABEL: uaddsat_i48_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -2783,12 +2770,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2796,12 +2782,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2809,11 +2794,10 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) @@ -2825,14 +2809,17 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX6-LABEL: uaddsat_i48_vs: ; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v1, v2, vcc -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -2845,7 +2832,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2858,7 +2844,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2870,7 +2855,6 @@ ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] @@ -2887,7 +2871,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2897,7 +2880,6 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2907,7 +2889,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2918,7 +2899,6 @@ ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] @@ -2930,57 +2910,29 @@ ; GFX6-LABEL: s_uaddsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s1, s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -2989,41 +2941,37 @@ define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX6-LABEL: uaddsat_i64_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i64_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i64_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -3036,7 +2984,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -3046,7 +2993,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -3056,7 +3002,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -3065,7 +3010,6 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -3080,12 +3024,10 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3095,12 +3037,10 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3110,12 +3050,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3125,11 +3063,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 @@ -3141,11 +3077,9 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_u32 v2, s0, v2, v6 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, s0 @@ -3159,97 +3093,41 @@ ; GFX6-LABEL: s_uaddsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: s_add_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_addc_u32 s1, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s1, v3 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s3, v1 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX6-NEXT: s_add_u32 s2, s2, s6 +; GFX6-NEXT: s_addc_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: s_add_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_addc_u32 s1, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v2 -; GFX8-NEXT: v_readfirstlane_b32 s1, v3 -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX8-NEXT: s_add_u32 s2, s2, s6 +; GFX8-NEXT: s_addc_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_add_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_addc_u32 s1, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v2 -; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_addc_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_v2i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6 ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -3259,103 +3137,31 @@ ; GFX6-LABEL: s_uaddsat_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_addc_u32 s1, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_addc_u32 s2, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_addc_u32 s2, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_addc_u32 s2, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_uaddsat_i128: @@ -3363,26 +3169,9 @@ ; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4 ; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 ; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] ; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10PLUS-NEXT: s_and_b32 s4, 1, s8 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) ret i128 %result @@ -3391,91 +3180,59 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX6-LABEL: uaddsat_i128_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v6, s2 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: uaddsat_i128_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: uaddsat_i128_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: uaddsat_i128_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo +; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -3492,14 +3249,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3515,14 +3264,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3538,14 +3279,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3558,14 +3291,6 @@ ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo ; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo @@ -3584,14 +3309,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3600,14 +3317,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX6-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc @@ -3621,14 +3330,6 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3637,14 +3338,6 @@ ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc @@ -3658,14 +3351,6 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc @@ -3674,14 +3359,6 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc @@ -3693,31 +3370,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_add_co_u32 v4, s4, v4, v12 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, v5, v13, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, v6, v14, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4 @@ -3731,34 +3392,18 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8 +; GFX11-NEXT: v_add_co_u32 v4, s0, v4, v12 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v5, v13, s0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v14, s0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, -1, s0 @@ -3771,293 +3416,66 @@ ; GFX6-LABEL: s_uaddsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_add_u32 s0, s0, s8 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: s_addc_u32 s1, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_addc_u32 s2, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_addc_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: s_addc_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v4 -; GFX6-NEXT: v_readfirstlane_b32 s1, v5 -; GFX6-NEXT: v_readfirstlane_b32 s2, v6 -; GFX6-NEXT: v_readfirstlane_b32 s3, v7 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v2 -; GFX6-NEXT: v_readfirstlane_b32 s7, v3 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] +; GFX6-NEXT: s_add_u32 s4, s4, s12 +; GFX6-NEXT: s_addc_u32 s5, s5, s13 +; GFX6-NEXT: s_addc_u32 s6, s6, s14 +; GFX6-NEXT: s_addc_u32 s7, s7, s15 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_uaddsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_add_u32 s0, s0, s8 ; GFX8-NEXT: s_addc_u32 s1, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: s_addc_u32 s2, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s8, 1, s10 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_add_u32 s0, s4, s12 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX8-NEXT: s_addc_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_addc_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v4 -; GFX8-NEXT: v_readfirstlane_b32 s1, v5 -; GFX8-NEXT: v_readfirstlane_b32 s2, v6 -; GFX8-NEXT: v_readfirstlane_b32 s3, v7 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_readfirstlane_b32 s6, v2 -; GFX8-NEXT: v_readfirstlane_b32 s7, v3 +; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] +; GFX8-NEXT: s_add_u32 s4, s4, s12 +; GFX8-NEXT: s_addc_u32 s5, s5, s13 +; GFX8-NEXT: s_addc_u32 s6, s6, s14 +; GFX8-NEXT: s_addc_u32 s7, s7, s15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] +; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_uaddsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 s0, s0, s8 ; GFX9-NEXT: s_addc_u32 s1, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_addc_u32 s2, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s8, 1, s10 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_add_u32 s0, s4, s12 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc -; GFX9-NEXT: s_addc_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_addc_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s1, v5 -; GFX9-NEXT: v_readfirstlane_b32 s2, v6 -; GFX9-NEXT: v_readfirstlane_b32 s3, v7 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: v_readfirstlane_b32 s7, v3 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] +; GFX9-NEXT: s_add_u32 s4, s4, s12 +; GFX9-NEXT: s_addc_u32 s5, s5, s13 +; GFX9-NEXT: s_addc_u32 s6, s6, s14 +; GFX9-NEXT: s_addc_u32 s7, s7, s15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_uaddsat_v2i128: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_add_u32 s0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s1, s9 -; GFX10-NEXT: s_addc_u32 s2, s2, s10 -; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] -; GFX10-NEXT: s_addc_u32 s3, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 -; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 -; GFX10-NEXT: s_and_b32 s8, 1, s16 -; GFX10-NEXT: s_add_u32 s4, s4, s12 -; GFX10-NEXT: s_addc_u32 s5, s5, s13 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] -; GFX10-NEXT: s_addc_u32 s6, s6, s14 -; GFX10-NEXT: s_addc_u32 s7, s7, s15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 -; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] -; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s8, 1, s8 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: s_uaddsat_v2i128: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_add_u32 s0, s0, s8 -; GFX11-NEXT: s_addc_u32 s1, s1, s9 -; GFX11-NEXT: s_addc_u32 s2, s2, s10 -; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] -; GFX11-NEXT: s_addc_u32 s3, s3, s11 -; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 -; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 -; GFX11-NEXT: s_and_b32 s8, 1, s16 -; GFX11-NEXT: s_add_u32 s4, s4, s12 -; GFX11-NEXT: s_addc_u32 s5, s5, s13 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] -; GFX11-NEXT: s_addc_u32 s6, s6, s14 -; GFX11-NEXT: s_addc_u32 s7, s7, s15 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 -; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] -; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_and_b32 s8, 1, s8 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: v_readfirstlane_b32 s2, v2 -; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_uaddsat_v2i128: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9 +; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3] +; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12 +; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13 +; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5] +; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2510,13 +2510,18 @@ ; GFX6-LABEL: v_usubsat_i48: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_i48: @@ -2524,11 +2529,10 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -2537,11 +2541,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2551,11 +2554,10 @@ ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] -; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) @@ -2565,71 +2567,51 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i48: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: s_mov_b32 s7, 0xffff -; GFX6-NEXT: s_subb_u32 s5, s1, s3 -; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_sub_u32 s0, s0, s2 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_subb_u32 s2, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_cmp_lg_u32 s2, s1 +; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i48: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_subb_u32 s5, s1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX8-NEXT: s_sub_u32 s0, s0, s2 +; GFX8-NEXT: s_subb_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 +; GFX9-NEXT: s_sub_u32 s0, s0, s2 +; GFX9-NEXT: s_subb_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_usubsat_i48: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 -; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0 -; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) ret i48 %result @@ -2638,17 +2620,19 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX6-LABEL: usubsat_i48_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i48_sv: @@ -2656,11 +2640,10 @@ ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2669,11 +2652,10 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2681,11 +2663,10 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) @@ -2697,17 +2678,19 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX6-LABEL: usubsat_i48_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i48_vs: @@ -2715,11 +2698,10 @@ ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX8-NEXT: ; return to shader part epilog ; @@ -2728,11 +2710,10 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; @@ -2740,11 +2721,10 @@ ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 +; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) @@ -2757,42 +2737,38 @@ ; GFX6-LABEL: v_usubsat_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10PLUS-LABEL: v_usubsat_i64: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -2801,58 +2777,30 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_sub_u32 s4, s0, s2 -; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_subb_u32 s5, s1, s3 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: s_sub_u32 s0, s0, s2 +; GFX6-NEXT: s_subb_u32 s1, s1, s3 +; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_sub_u32 s4, s0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_subb_u32 s5, s1, s3 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: s_sub_u32 s0, s0, s2 +; GFX8-NEXT: s_subb_u32 s1, s1, s3 +; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_sub_u32 s4, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_subb_u32 s5, s1, s3 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_sub_u32 s0, s0, s2 +; GFX9-NEXT: s_subb_u32 s1, s1, s3 +; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_usubsat_i64: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -2862,40 +2810,36 @@ ; GFX6-LABEL: usubsat_i64_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i64_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i64_sv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: usubsat_i64_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2906,40 +2850,36 @@ ; GFX6-LABEL: usubsat_i64_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i64_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i64_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: usubsat_i64_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 +; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -2950,78 +2890,68 @@ ; GFX6-LABEL: v_usubsat_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 -; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 -; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] -; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4 -; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0, s4 +; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_sub_co_u32 v2, s4, v2, v6 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v2i64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_sub_co_u32 v4, s0, v2, v6 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v3, v7, s0 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, 0, s0 +; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_sub_co_u32 v2, s0, v2, v6 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -3030,98 +2960,42 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: s_sub_u32 s0, s2, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX6-NEXT: s_subb_u32 s1, s3, s7 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v2 -; GFX6-NEXT: v_readfirstlane_b32 s1, v3 -; GFX6-NEXT: v_readfirstlane_b32 s2, v0 -; GFX6-NEXT: v_readfirstlane_b32 s3, v1 +; GFX6-NEXT: s_sub_u32 s0, s0, s4 +; GFX6-NEXT: s_subb_u32 s1, s1, s5 +; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX6-NEXT: s_sub_u32 s2, s2, s6 +; GFX6-NEXT: s_subb_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_sub_u32 s0, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX8-NEXT: s_subb_u32 s1, s3, s7 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v2 -; GFX8-NEXT: v_readfirstlane_b32 s1, v3 -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: s_sub_u32 s0, s0, s4 +; GFX8-NEXT: s_subb_u32 s1, s1, s5 +; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX8-NEXT: s_sub_u32 s2, s2, s6 +; GFX8-NEXT: s_subb_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_sub_u32 s0, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX9-NEXT: s_subb_u32 s1, s3, s7 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v2 -; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: s_sub_u32 s0, s0, s4 +; GFX9-NEXT: s_subb_u32 s1, s1, s5 +; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX9-NEXT: s_sub_u32 s2, s2, s6 +; GFX9-NEXT: s_subb_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_usubsat_v2i64: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4 ; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5 -; GFX10PLUS-NEXT: s_sub_u32 s0, s2, s6 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s4 -; GFX10PLUS-NEXT: s_subb_u32 s1, s3, s7 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s8, 0, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX10PLUS-NEXT: s_sub_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -3130,131 +3004,42 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-LABEL: s_usubsat_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_sub_u32 s8, s0, s4 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_subb_u32 s10, s2, s6 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_subb_u32 s11, s3, s7 -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_mov_b32_e32 v2, s9 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: v_mov_b32_e32 v3, s11 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v0 -; GFX6-NEXT: v_readfirstlane_b32 s1, v1 -; GFX6-NEXT: v_readfirstlane_b32 s2, v2 -; GFX6-NEXT: v_readfirstlane_b32 s3, v3 +; GFX6-NEXT: s_sub_u32 s0, s0, s4 +; GFX6-NEXT: s_subb_u32 s1, s1, s5 +; GFX6-NEXT: s_subb_u32 s2, s2, s6 +; GFX6-NEXT: s_subb_u32 s3, s3, s7 +; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s8, s0, s4 -; GFX8-NEXT: s_subb_u32 s9, s1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_subb_u32 s10, s2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_readfirstlane_b32 s2, v2 -; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: s_sub_u32 s0, s0, s4 +; GFX8-NEXT: s_subb_u32 s1, s1, s5 +; GFX8-NEXT: s_subb_u32 s2, s2, s6 +; GFX8-NEXT: s_subb_u32 s3, s3, s7 +; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s8, s0, s4 -; GFX9-NEXT: s_subb_u32 s9, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: s_subb_u32 s10, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: s_sub_u32 s0, s0, s4 +; GFX9-NEXT: s_subb_u32 s1, s1, s5 +; GFX9-NEXT: s_subb_u32 s2, s2, s6 +; GFX9-NEXT: s_subb_u32 s3, s3, s7 +; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_usubsat_i128: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5] -; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5 -; GFX10PLUS-NEXT: s_subb_u32 s10, s2, s6 -; GFX10PLUS-NEXT: s_subb_u32 s11, s3, s7 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[6:7] -; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s12 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s11, 0, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4 +; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s6 +; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) ret i128 %result @@ -3264,90 +3049,58 @@ ; GFX6-LABEL: usubsat_i128_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; GFX6-NEXT: v_mov_b32_e32 v6, s2 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i128_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i128_sv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: usubsat_i128_sv: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo ; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -3358,90 +3111,58 @@ ; GFX6-LABEL: usubsat_i128_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v6, s2 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc -; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: usubsat_i128_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: usubsat_i128_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: usubsat_i128_vs: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo ; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo ; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v4 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX10PLUS-NEXT: ; return to shader part epilog %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -3452,188 +3173,108 @@ ; GFX6-LABEL: v_usubsat_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8 -; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc -; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc -; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12 -; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc -; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc -; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 +; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc +; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc +; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8 -; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc -; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc -; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12 -; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc -; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc -; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_usubsat_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v9, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v13, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v14, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 +; GFX10-NEXT: v_sub_co_u32 v4, s4, v4, v12 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12 -; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s5 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v2i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_eq_u64_e64 s1, v[6:7], v[14:15] -; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX11-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 +; GFX11-NEXT: v_sub_co_u32 v4, s0, v4, v12 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, v18, v17, s1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo +; GFX11-NEXT: v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0 ; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo -; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12 -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v8 -; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s1 +; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result @@ -3642,294 +3283,67 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { ; GFX6-LABEL: s_usubsat_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_sub_u32 s16, s0, s8 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_subb_u32 s18, s2, s10 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: s_subb_u32 s19, s3, s11 -; GFX6-NEXT: v_mov_b32_e32 v1, s16 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s18 -; GFX6-NEXT: v_mov_b32_e32 v1, s19 -; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s14 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX6-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_subb_u32 s3, s7, s15 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX6-NEXT: v_readfirstlane_b32 s0, v4 -; GFX6-NEXT: v_readfirstlane_b32 s1, v5 -; GFX6-NEXT: v_readfirstlane_b32 s2, v6 -; GFX6-NEXT: v_readfirstlane_b32 s3, v7 -; GFX6-NEXT: v_readfirstlane_b32 s4, v0 -; GFX6-NEXT: v_readfirstlane_b32 s5, v1 -; GFX6-NEXT: v_readfirstlane_b32 s6, v2 -; GFX6-NEXT: v_readfirstlane_b32 s7, v3 +; GFX6-NEXT: s_sub_u32 s0, s0, s8 +; GFX6-NEXT: s_subb_u32 s1, s1, s9 +; GFX6-NEXT: s_subb_u32 s2, s2, s10 +; GFX6-NEXT: s_subb_u32 s3, s3, s11 +; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] +; GFX6-NEXT: s_sub_u32 s4, s4, s12 +; GFX6-NEXT: s_subb_u32 s5, s5, s13 +; GFX6-NEXT: s_subb_u32 s6, s6, s14 +; GFX6-NEXT: s_subb_u32 s7, s7, s15 +; GFX6-NEXT: s_cselect_b64 s[4:5], 0, s[4:5] +; GFX6-NEXT: s_cselect_b64 s[6:7], 0, s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_usubsat_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sub_u32 s16, s0, s8 -; GFX8-NEXT: s_subb_u32 s17, s1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: s_subb_u32 s18, s2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, 1, s10 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_sub_u32 s0, s4, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX8-NEXT: s_subb_u32 s1, s5, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: s_and_b32 s4, 1, s8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX8-NEXT: v_readfirstlane_b32 s0, v4 -; GFX8-NEXT: v_readfirstlane_b32 s1, v5 -; GFX8-NEXT: v_readfirstlane_b32 s2, v6 -; GFX8-NEXT: v_readfirstlane_b32 s3, v7 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_readfirstlane_b32 s6, v2 -; GFX8-NEXT: v_readfirstlane_b32 s7, v3 +; GFX8-NEXT: s_sub_u32 s0, s0, s8 +; GFX8-NEXT: s_subb_u32 s1, s1, s9 +; GFX8-NEXT: s_subb_u32 s2, s2, s10 +; GFX8-NEXT: s_subb_u32 s3, s3, s11 +; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] +; GFX8-NEXT: s_sub_u32 s4, s4, s12 +; GFX8-NEXT: s_subb_u32 s5, s5, s13 +; GFX8-NEXT: s_subb_u32 s6, s6, s14 +; GFX8-NEXT: s_subb_u32 s7, s7, s15 +; GFX8-NEXT: s_cselect_b64 s[4:5], 0, s[4:5] +; GFX8-NEXT: s_cselect_b64 s[6:7], 0, s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_usubsat_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_sub_u32 s16, s0, s8 -; GFX9-NEXT: s_subb_u32 s17, s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_subb_u32 s18, s2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: s_and_b32 s0, 1, s10 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_sub_u32 s0, s4, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc -; GFX9-NEXT: s_subb_u32 s1, s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: s_and_b32 s4, 1, s8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v4 -; GFX9-NEXT: v_readfirstlane_b32 s1, v5 -; GFX9-NEXT: v_readfirstlane_b32 s2, v6 -; GFX9-NEXT: v_readfirstlane_b32 s3, v7 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_readfirstlane_b32 s6, v2 -; GFX9-NEXT: v_readfirstlane_b32 s7, v3 +; GFX9-NEXT: s_sub_u32 s0, s0, s8 +; GFX9-NEXT: s_subb_u32 s1, s1, s9 +; GFX9-NEXT: s_subb_u32 s2, s2, s10 +; GFX9-NEXT: s_subb_u32 s3, s3, s11 +; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] +; GFX9-NEXT: s_sub_u32 s4, s4, s12 +; GFX9-NEXT: s_subb_u32 s5, s5, s13 +; GFX9-NEXT: s_subb_u32 s6, s6, s14 +; GFX9-NEXT: s_subb_u32 s7, s7, s15 +; GFX9-NEXT: s_cselect_b64 s[4:5], 0, s[4:5] +; GFX9-NEXT: s_cselect_b64 s[6:7], 0, s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_usubsat_v2i128: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_u32 s16, s0, s8 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] -; GFX10-NEXT: s_subb_u32 s17, s1, s9 -; GFX10-NEXT: s_subb_u32 s18, s2, s10 -; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s20 -; GFX10-NEXT: s_sub_u32 s2, s4, s12 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_subb_u32 s1, s5, s13 -; GFX10-NEXT: s_subb_u32 s8, s6, s14 -; GFX10-NEXT: s_subb_u32 s3, s7, s15 -; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: s_usubsat_v2i128: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_sub_u32 s16, s0, s8 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] -; GFX11-NEXT: s_subb_u32 s17, s1, s9 -; GFX11-NEXT: s_subb_u32 s18, s2, s10 -; GFX11-NEXT: s_subb_u32 s19, s3, s11 -; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s20 -; GFX11-NEXT: s_sub_u32 s2, s4, s12 -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_subb_u32 s1, s5, s13 -; GFX11-NEXT: s_subb_u32 s8, s6, s14 -; GFX11-NEXT: s_subb_u32 s3, s7, s15 -; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] -; GFX11-NEXT: s_cselect_b32 s0, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s0 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: v_readfirstlane_b32 s2, v2 -; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_readfirstlane_b32 s4, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v5 -; GFX11-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_usubsat_v2i128: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s8 +; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s9 +; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s10 +; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s11 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] +; GFX10PLUS-NEXT: s_sub_u32 s4, s4, s12 +; GFX10PLUS-NEXT: s_subb_u32 s5, s5, s13 +; GFX10PLUS-NEXT: s_subb_u32 s6, s6, s14 +; GFX10PLUS-NEXT: s_subb_u32 s7, s7, s15 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], 0, s[4:5] +; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], 0, s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result }