diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4023,6 +4023,23 @@ return false; } + /// Allows the target to handle physreg-carried dependency + /// in target-specific way. Used from the ScheduleDAGSDNodes to decide whether + /// to add the edge to the dependency graph. + /// Def - input: Selection DAG node defining physical register + /// User - input: Selection DAG node using physical register + /// Op - input: Number of User operand + /// PhysReg - inout: set to the physical register if the edge is + /// necessary, unchanged otherwise + /// Cost - inout: physical register copy cost. + /// Returns 'true' if the edge is necessary, 'false' otherwise + virtual bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, + const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, + unsigned &PhysReg, int &Cost) const { + return false; + } + /// Target-specific combining of register parts into its original value virtual SDValue joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL, diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -79,10 +79,10 @@ return NewRC; } -const TargetRegisterClass * -MachineRegisterInfo::constrainRegClass(Register Reg, - const TargetRegisterClass *RC, - unsigned MinNumRegs) { +const TargetRegisterClass *MachineRegisterInfo::constrainRegClass( + Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs) { + if (Reg.isPhysical()) + return nullptr; return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -110,11 +110,15 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, + const TargetLowering &TLI, unsigned &PhysReg, int &Cost) { if (Op != 2 || User->getOpcode() != ISD::CopyToReg) return; unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg(); + if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, TII, PhysReg, Cost)) + return; + if (Register::isVirtualRegister(Reg)) return; @@ -485,7 +489,8 @@ unsigned PhysReg = 0; int Cost = 1; // Determine if this is a physical register dependency. - CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost); + const TargetLowering &TLI = DAG->getTargetLoweringInfo(); + CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, PhysReg, Cost); assert((PhysReg == 0 || !isChain) && "Chain dependence via physreg data?"); // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -479,6 +479,11 @@ bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const; bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const; + bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, + const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, unsigned &PhysReg, + int &Cost) const override; + bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN = false, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12988,3 +12988,28 @@ return MONoClobber; return MachineMemOperand::MONone; } + +bool SITargetLowering::checkForPhysRegDependency( + SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, + 
const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { + if (User->getOpcode() != ISD::CopyToReg) + return false; + if (!Def->isMachineOpcode()) + return false; + MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def); + if (!MDef) + return false; + + unsigned ResNo = User->getOperand(Op).getResNo(); + if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1) + return false; + const MCInstrDesc &II = TII->get(MDef->getMachineOpcode()); + if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) { + PhysReg = AMDGPU::SCC; + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); + Cost = RC->getCopyCost(); + return true; + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -931,6 +931,8 @@ SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (isAGPRClass(RC) && !ST.hasGFX90AInsts()) return getEquivalentVGPRClass(RC); + if (RC == &AMDGPU::SCC_CLASSRegClass) + return getWaveMaskRegClass(); return RC; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -523,8 +523,7 @@ class SelectPat<SDPatternOperator select> : PatFrag < (ops node:$src1, node:$src2), (select SCC, $src1, $src2), - [{ return Subtarget->hasScalarCompareEq64() && - N->getOperand(0)->hasOneUse() && !N->isDivergent(); }] + [{ return !N->isDivergent(); }] >; let Uses = [SCC] in { diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll --- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -58,7 +58,7 @@ ; FUNC-LABEL: {{^}}null_32bit_lds_ptr: ; GFX7 v_cmp_ne_u32 -; GFX7: v_cndmask_b32 +; GFX7: s_cselect_b32 ; GFX8: 
s_cmp_lg_u32 ; GFX8-NOT: v_cmp_ne_u32 ; GFX8: s_cselect_b32 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -9,33 +9,26 @@ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} -; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b64 vcc, -1, 0 -; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b64 vcc, -1, 0 -; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_SHARED_BASE]], 0 +; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] ; At most 2 digits. Make sure src_shared_base is not counted as a high ; number SGPR. 
-; CI: NumSgprs: {{[0-9][0-9]+}} -; GFX9: NumSgprs: {{[0-9]+}} +; HSA: NumSgprs: {{[0-9]+}} define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32* store volatile i32 7, i32* %stof @@ -75,33 +68,26 @@ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} -; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1 -; CI-DAG: s_cselect_b64 vcc, -1, 0 -; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0 +; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 -; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]] ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9: s_cmp_lg_u32 [[PTR]], -1 -; GFX9: s_cselect_b64 vcc, -1, 0 -; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc -; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]], 0 +; GFX9: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]] -; CI: NumSgprs: {{[0-9][0-9]+}} -; GFX9: NumSgprs: {{[0-9]+}} +; HSA: NumSgprs: {{[0-9]+}} define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 { %stof = addrspacecast i32 addrspace(5)* %ptr to i32* store volatile i32 7, i32* %stof @@ -155,14 +141,16 @@ ; HSA: enable_sgpr_queue_ptr = 0 ; HSA: 
s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]] -; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} -; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] -; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] +; CI-DAG: v_cmp_ne_u64_e64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} +; CI-DAG: s_and_b64 s{{[[0-9]+:[0-9]+]}}, s[[[CMP_LO]]:[[CMP_HI]]], exec +; CI-DAG: s_cselect_b32 [[CASTPTR:s[0-9]+]], s[[PTR_LO]], -1 +; CI-DAG: v_mov_b32_e32 [[VCASTPTR:v[0-9]+]], [[CASTPTR]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0 ; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]] -; HSA: ds_write_b32 [[CASTPTR]], v[[K]] +; CI-DAG: ds_write_b32 [[VCASTPTR]], v[[K]] +; GFX9-DAG: ds_write_b32 [[CASTPTR]], v[[K]] define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos @@ -175,14 +163,19 @@ ; HSA: enable_sgpr_queue_ptr = 0 ; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]] -; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} -; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] -; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] +; CI-DAG v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} +; CI-DAG v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] +; CI-DAG v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] +; CI-DAG: v_cmp_ne_u64_e64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}} +; CI-DAG: s_and_b64 s{{[[0-9]+:[0-9]+]}}, s[[[CMP_LO]]:[[CMP_HI]]], exec +; CI-DAG: s_cselect_b32 [[CASTPTR:s[0-9]+]], s[[PTR_LO]], -1 +; CI-DAG: v_mov_b32_e32 [[VCASTPTR:v[0-9]+]], [[CASTPTR]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0 ; GFX9-DAG: 
s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]] -; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; CI: buffer_store_dword v[[K]], [[VCASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; GFX9: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(5)* store volatile i32 0, i32 addrspace(5)* %ftos diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -521,266 +521,258 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 { ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb -; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc +; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10 -; GFX908-NEXT: v_mov_b32_e32 v1, 0 -; GFX908-NEXT: s_load_dword s5, s[4:5], 0x18 -; GFX908-NEXT: s_mov_b32 s4, 0 +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX908-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX908-NEXT: s_mov_b32 s8, 0 +; GFX908-NEXT: s_mov_b32 s5, s8 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX908-NEXT: s_sub_i32 s6, 0, s3 -; GFX908-NEXT: s_lshl_b64 s[8:9], s[10:11], 5 -; GFX908-NEXT: s_lshr_b32 s12, s5, 16 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s12 -; GFX908-NEXT: s_or_b32 s8, s8, 28 -; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v6, s10 -; 
GFX908-NEXT: v_mov_b32_e32 v7, s11 -; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 -; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX908-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s8 -; GFX908-NEXT: v_mov_b32_e32 v3, s9 -; GFX908-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX908-NEXT: v_sub_u32_e32 v4, s2, v4 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX908-NEXT: v_subrev_u32_e32 v5, s3, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1] +; GFX908-NEXT: s_sub_i32 s4, 0, s3 +; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s9 +; GFX908-NEXT: v_mov_b32_e32 v19, 0 +; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX908-NEXT: v_readfirstlane_b32 s10, v2 +; GFX908-NEXT: s_mul_i32 s4, s4, s10 +; GFX908-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX908-NEXT: s_add_i32 s10, s10, s4 +; GFX908-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX908-NEXT: s_mul_i32 s10, s4, s3 +; GFX908-NEXT: s_sub_i32 s2, s2, s10 +; GFX908-NEXT: s_add_i32 s11, s4, 1 +; GFX908-NEXT: s_sub_i32 s10, s2, s3 +; GFX908-NEXT: s_cmp_ge_u32 s2, s3 +; GFX908-NEXT: s_cselect_b32 s4, s11, s4 +; GFX908-NEXT: s_cselect_b32 s2, s10, s2 +; GFX908-NEXT: s_add_i32 s10, s4, 1 +; GFX908-NEXT: s_cmp_ge_u32 s2, s3 +; GFX908-NEXT: s_cselect_b32 s4, s10, s4 +; GFX908-NEXT: s_lshr_b32 s9, s9, 16 +; GFX908-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s9 +; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 +; GFX908-NEXT: s_or_b32 s10, s10, 28 ; GFX908-NEXT: s_waitcnt 
vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s2, v24 -; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX908-NEXT: s_mul_i32 s1, s1, s2 -; GFX908-NEXT: s_mul_hi_u32 s3, s0, s2 -; GFX908-NEXT: s_mul_i32 s0, s0, s2 -; GFX908-NEXT: s_add_i32 s1, s3, s1 -; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5 +; GFX908-NEXT: v_readfirstlane_b32 s5, v16 +; GFX908-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX908-NEXT: s_mul_i32 s1, s1, s5 +; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX908-NEXT: s_mul_i32 s0, s0, s5 +; GFX908-NEXT: s_add_i32 s1, s9, s1 +; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 -; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX908-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX908-NEXT: s_add_u32 s6, s6, s4 +; GFX908-NEXT: s_addc_u32 s7, s7, 0 +; GFX908-NEXT: s_add_u32 s10, s10, s12 +; GFX908-NEXT: s_addc_u32 s11, s11, s13 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 ; GFX908-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v8, 0 -; GFX908-NEXT: v_mov_b32_e32 v9, 0 -; GFX908-NEXT: global_load_dwordx2 v[8:9], v[8:9], off -; GFX908-NEXT: s_mov_b32 s5, s4 -; GFX908-NEXT: v_mov_b32_e32 v13, s5 -; GFX908-NEXT: v_mov_b32_e32 v15, s5 -; GFX908-NEXT: v_mov_b32_e32 v17, s5 -; GFX908-NEXT: v_mov_b32_e32 v12, s4 -; GFX908-NEXT: v_mov_b32_e32 v14, s4 -; GFX908-NEXT: v_mov_b32_e32 v16, s4 -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[6:7] -; GFX908-NEXT: v_mov_b32_e32 v11, v3 -; GFX908-NEXT: v_mov_b32_e32 v19, v13 -; GFX908-NEXT: v_mov_b32_e32 v10, v2 -; GFX908-NEXT: v_mov_b32_e32 v18, v12 +; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX908-NEXT: s_mov_b32 s9, s8 +; GFX908-NEXT: v_mov_b32_e32 v4, s8 
+; GFX908-NEXT: v_mov_b32_e32 v6, s8 +; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v5, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 +; GFX908-NEXT: v_mov_b32_e32 v9, s9 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX908-NEXT: v_mov_b32_e32 v11, v5 +; GFX908-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX908-NEXT: v_mov_b32_e32 v10, v4 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s2, v8 -; GFX908-NEXT: v_readfirstlane_b32 s3, v9 -; GFX908-NEXT: s_add_u32 s2, s2, 1 -; GFX908-NEXT: s_addc_u32 s3, s3, 0 -; GFX908-NEXT: s_mul_hi_u32 s5, s6, s2 -; GFX908-NEXT: s_mul_i32 s11, s7, s2 -; GFX908-NEXT: s_mul_i32 s10, s6, s2 -; GFX908-NEXT: s_mul_i32 s2, s6, s3 -; GFX908-NEXT: s_add_i32 s2, s5, s2 -; GFX908-NEXT: s_add_i32 s5, s2, s11 +; GFX908-NEXT: v_readfirstlane_b32 s5, v2 +; GFX908-NEXT: v_readfirstlane_b32 s9, v3 +; GFX908-NEXT: s_add_u32 s5, s5, 1 +; GFX908-NEXT: s_addc_u32 s9, s9, 0 +; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX908-NEXT: s_mul_i32 s20, s3, s5 +; GFX908-NEXT: s_mul_i32 s18, s2, s5 +; GFX908-NEXT: s_mul_i32 s5, s2, s9 +; GFX908-NEXT: s_add_i32 s5, s19, s5 +; GFX908-NEXT: s_add_i32 s5, s5, s20 ; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_4: ; %bb58 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] -; GFX908-NEXT: v_mov_b32_e32 v20, s9 -; GFX908-NEXT: v_add_co_u32_e64 v10, s[2:3], s8, v10 -; GFX908-NEXT: v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3] +; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; GFX908-NEXT: s_add_u32 s16, s16, s0 +; GFX908-NEXT: s_addc_u32 s17, s17, s1 ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; 
GFX908-NEXT: .LBB3_5: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX908-NEXT: v_mov_b32_e32 v21, s5 -; GFX908-NEXT: v_add_co_u32_e32 v20, vcc, s10, v10 -; GFX908-NEXT: v_addc_co_u32_e32 v21, vcc, v11, v21, vcc -; GFX908-NEXT: global_load_dword v28, v[20:21], off offset:-12 glc +; GFX908-NEXT: s_add_u32 s20, s16, s18 +; GFX908-NEXT: s_addc_u32 s21, s17, s5 +; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v27, v[20:21], off offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v22, v[20:21], off offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v[20:21], off glc +; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: ds_read_b64 v[20:21], v1 -; GFX908-NEXT: ds_read_b64 v[22:23], v0 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: ds_read_b64 v[12:13], v19 +; GFX908-NEXT: ds_read_b64 v[14:15], v0 +; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_4 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v28, v28 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v27, v27 -; GFX908-NEXT: v_add_f32_e32 v31, v25, v20 -; GFX908-NEXT: v_add_f32_e32 v32, v26, v21 -; GFX908-NEXT: v_add_f32_e32 v33, 0, v20 -; GFX908-NEXT: v_add_f32_e32 v34, 0, v21 -; GFX908-NEXT: v_add_f32_e32 v23, v29, v23 -; GFX908-NEXT: v_add_f32_e32 v22, v28, v22 -; GFX908-NEXT: v_add_f32_e32 v21, v30, v21 
-; GFX908-NEXT: v_add_f32_e32 v20, v27, v20 -; GFX908-NEXT: v_add_f32_e32 v13, v13, v32 -; GFX908-NEXT: v_add_f32_e32 v12, v12, v31 -; GFX908-NEXT: v_add_f32_e32 v15, v15, v34 -; GFX908-NEXT: v_add_f32_e32 v14, v14, v33 -; GFX908-NEXT: v_add_f32_e32 v16, v16, v22 -; GFX908-NEXT: v_add_f32_e32 v17, v17, v23 -; GFX908-NEXT: v_add_f32_e32 v18, v18, v20 -; GFX908-NEXT: v_add_f32_e32 v19, v19, v21 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX908-NEXT: v_add_f32_e32 v24, v17, v12 +; GFX908-NEXT: v_add_f32_e32 v25, v18, v13 +; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 +; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 +; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 +; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 +; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 +; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 +; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 +; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_branch .LBB3_4 ; ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb -; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc +; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s7, s[4:5], 0x18 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX90A-NEXT: s_load_dword s9, s[4:5], 0x18 +; GFX90A-NEXT: s_mov_b32 s8, 0 +; GFX90A-NEXT: s_mov_b32 s5, s8 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s12, 0, s3 -; GFX90A-NEXT: s_lshr_b32 s13, s7, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13 -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 -; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s9 +; GFX90A-NEXT: v_readfirstlane_b32 s10, v1 +; GFX90A-NEXT: s_mul_i32 s4, s4, s10 +; GFX90A-NEXT: s_mul_hi_u32 s4, s10, s4 +; GFX90A-NEXT: s_add_i32 s10, s10, s4 +; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s10 +; GFX90A-NEXT: s_mul_i32 s10, s4, s3 +; GFX90A-NEXT: s_sub_i32 s2, s2, s10 +; GFX90A-NEXT: s_add_i32 s11, s4, 1 +; GFX90A-NEXT: s_sub_i32 s10, s2, s3 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s4, s11, s4 +; GFX90A-NEXT: s_cselect_b32 s2, s10, s2 +; GFX90A-NEXT: s_add_i32 s10, s4, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 +; GFX90A-NEXT: s_cselect_b32 s4, s10, s4 +; GFX90A-NEXT: s_lshr_b32 s9, s9, 16 +; GFX90A-NEXT: s_lshl_b64 s[12:13], s[4:5], 5 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s9 +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[6:7], 5 ; GFX90A-NEXT: s_or_b32 s10, s10, 28 -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] -; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0 -; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s3 -; GFX90A-NEXT: v_sub_u32_e32 v8, s2, v8 -; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX90A-NEXT: 
v_subrev_u32_e32 v9, s3, v8 -; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s2, v28 -; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX90A-NEXT: s_mul_i32 s1, s1, s2 -; GFX90A-NEXT: s_mul_hi_u32 s3, s0, s2 -; GFX90A-NEXT: s_mul_i32 s0, s0, s2 -; GFX90A-NEXT: s_add_i32 s1, s3, s1 -; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v18 +; GFX90A-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX90A-NEXT: s_mul_i32 s1, s1, s5 +; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX90A-NEXT: s_mul_i32 s0, s0, s5 +; GFX90A-NEXT: s_add_i32 s1, s9, s1 +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v9, vcc +; GFX90A-NEXT: s_add_u32 s6, s6, s4 +; GFX90A-NEXT: s_addc_u32 s7, s7, 0 +; GFX90A-NEXT: s_add_u32 s10, s10, s12 +; GFX90A-NEXT: s_addc_u32 s11, s11, s13 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off -; GFX90A-NEXT: s_mov_b32 s7, s6 -; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[14:15], 
v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1] +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX90A-NEXT: s_mov_b64 s[16:17], s[10:11] +; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s7, v12 -; GFX90A-NEXT: v_readfirstlane_b32 s8, v13 -; GFX90A-NEXT: s_add_u32 s7, s7, 1 -; GFX90A-NEXT: s_addc_u32 s9, s8, 0 -; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s7 -; GFX90A-NEXT: s_mul_i32 s11, s5, s7 -; GFX90A-NEXT: s_mul_i32 s8, s4, s7 -; GFX90A-NEXT: s_mul_i32 s7, s4, s9 -; GFX90A-NEXT: s_add_i32 s7, s10, s7 -; GFX90A-NEXT: s_add_i32 s7, s7, s11 +; GFX90A-NEXT: v_readfirstlane_b32 s5, v4 +; GFX90A-NEXT: v_readfirstlane_b32 s9, v5 +; GFX90A-NEXT: s_add_u32 s5, s5, 1 +; GFX90A-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5 +; GFX90A-NEXT: s_mul_i32 s20, s3, s5 +; GFX90A-NEXT: s_mul_i32 s18, s2, s5 +; GFX90A-NEXT: s_mul_i32 s5, s2, s9 +; GFX90A-NEXT: s_add_i32 s5, s19, s5 +; GFX90A-NEXT: s_add_i32 s5, s5, s20 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX90A-NEXT: v_mov_b32_e32 v24, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s2, v14 -; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc -; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] +; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX90A-NEXT: v_addc_co_u32_e32 
v5, vcc, 0, v5, vcc +; GFX90A-NEXT: s_add_u32 s16, s16, s0 +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: s_addc_u32 s17, s17, s1 ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_mov_b32_e32 v25, s7 -; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14 -; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc -; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v29, v[24:25], off offset:-8 glc +; GFX90A-NEXT: s_add_u32 s20, s16, s18 +; GFX90A-NEXT: s_addc_u32 s21, s17, s5 +; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v26, v[24:25], off offset:-4 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v26, v[24:25], off glc +; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ; kill: killed $vgpr24 killed $vgpr25 -; GFX90A-NEXT: ds_read_b64 v[24:25], v1 +; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: ds_read_b64 v[26:27], v0 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: ds_read_b64 v[14:15], v19 +; GFX90A-NEXT: ds_read_b64 v[16:17], v0 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15] +; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4 ; GFX90A-NEXT: ; %bb.6: ; %bb51 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v30, v30 -; GFX90A-NEXT: v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX90A-NEXT: 
v_cvt_f32_f16_e32 v32, v29 -; GFX90A-NEXT: v_pk_add_f32 v[34:35], v[2:3], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0] -; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[30:31], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[32:33], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[34:35] -; GFX90A-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[36:37] -; GFX90A-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[24:25] +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 +; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GFX90A-NEXT: v_pk_add_f32 v[24:25], v[0:1], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0] +; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] +; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 bb: %i = load volatile i16, i16 addrspace(4)* undef, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -54,44 +54,49 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: 
v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s3 +; 
GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s5, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, i32 addrspace(1)* %out @@ -145,16 +150,18 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -248,63 +255,68 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_xor_b32 s2, s2, s9 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_xor_b32 s0, s9, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; 
GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_mul_i32 s1, s1, s3 +; GFX6-NEXT: s_sub_i32 s1, s2, s1 +; GFX6-NEXT: s_sub_i32 s2, s1, s3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s1, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 ; GFX9-NEXT: s_ashr_i32 s5, s2, 31 ; GFX9-NEXT: s_add_i32 s2, s2, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; 
GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s5, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s2, s6, s2 +; GFX9-NEXT: s_add_i32 s6, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s6, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, i32 addrspace(1)* %out @@ -372,16 +384,18 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX6-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s4 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s4 +; GFX6-NEXT: s_cmp_ge_u32 s6, s4 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s4 +; GFX6-NEXT: s_cmp_ge_u32 s6, s4 +; GFX6-NEXT: s_cselect_b32 s4, s7, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -605,15 +619,15 @@ ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -679,24 +693,24 @@ ; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s4, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX6-NEXT: s_sext_i32_i16 s3, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GFX6-NEXT: s_xor_b32 s3, s3, s2 +; GFX6-NEXT: s_ashr_i32 s5, s4, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_sext_i32_i16 s2, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX6-NEXT: s_xor_b32 s2, s2, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s3, s3, 30 -; 
GFX6-NEXT: s_or_b32 s3, s3, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: s_ashr_i32 s2, s2, 30 +; GFX6-NEXT: s_or_b32 s6, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -909,15 +923,15 @@ ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -985,22 +999,22 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX6-NEXT: s_sext_i32_i8 s5, s4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s2, s5, s2 +; GFX6-NEXT: s_sext_i32_i8 s3, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX6-NEXT: s_xor_b32 s2, s3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; 
GFX6-NEXT: s_ashr_i32 s2, s2, 30 -; GFX6-NEXT: s_or_b32 s2, s2, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_or_b32 s6, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_lshr_b32 s3, s4, 8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -1174,88 +1188,100 @@ ; ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX6-NEXT: s_sub_i32 s2, 0, s12 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s15 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v6, v6 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_sub_i32 s2, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s10 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s12 +; GFX6-NEXT: s_sub_i32 s2, s8, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s12 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s12 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_sub_i32 s4, 0, s13 +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 +; 
GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: s_mul_i32 s4, s4, s13 +; GFX6-NEXT: s_sub_i32 s4, s9, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s13 +; GFX6-NEXT: s_cmp_ge_u32 s4, s13 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s4, s13 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_sub_i32 s6, 0, s14 +; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX6-NEXT: v_readfirstlane_b32 s6, v3 +; GFX6-NEXT: s_mul_i32 s6, s6, s14 +; GFX6-NEXT: s_sub_i32 s6, s10, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s14 +; GFX6-NEXT: s_cmp_ge_u32 s6, s14 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s14 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: s_sub_i32 s8, 0, s15 +; GFX6-NEXT: v_mul_lo_u32 v7, s8, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 +; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7] +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5 +; GFX6-NEXT: v_readfirstlane_b32 s0, v5 +; GFX6-NEXT: s_mul_i32 s0, s0, s15 +; GFX6-NEXT: s_sub_i32 s0, s11, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s15 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GFX6-NEXT: s_cmp_ge_u32 s0, s15 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; GFX6-NEXT: s_cmp_ge_u32 s0, s15 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm ; ; 
GFX9-LABEL: udiv_v4i32: @@ -1267,79 +1293,87 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_sub_i32 s2, 0, s8 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v2 -; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v6 -; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v1, v8, vcc -; GFX9-NEXT: v_mul_hi_u32 v3, v6, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 -; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s10, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_readfirstlane_b32 s3, v0 +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 +; GFX9-NEXT: s_mul_i32 s3, s2, s8 +; GFX9-NEXT: s_sub_i32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s13, s2, 1 +; GFX9-NEXT: s_sub_i32 s4, s3, s8 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s2, s13, s2 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s2, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_sub_i32 s3, 0, s9 +; GFX9-NEXT: s_mul_i32 s3, s3, s12 +; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3 +; GFX9-NEXT: s_add_i32 s12, s12, s3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2 +; GFX9-NEXT: 
s_mul_hi_u32 s3, s5, s12 +; GFX9-NEXT: s_mul_i32 s4, s3, s9 +; GFX9-NEXT: s_sub_i32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s8, s3, 1 +; GFX9-NEXT: s_sub_i32 s5, s4, s9 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s9 +; GFX9-NEXT: s_cselect_b32 s3, s5, s3 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GFX9-NEXT: s_sub_i32 s4, 0, s10 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_mul_hi_u32 s4, s6, s5 +; GFX9-NEXT: s_mul_i32 s5, s4, s10 +; GFX9-NEXT: s_sub_i32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_sub_i32 s8, s5, s10 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_cmp_ge_u32 s5, s10 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s10 +; GFX9-NEXT: s_cselect_b32 s4, s6, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s11 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s6 +; GFX9-NEXT: s_mul_i32 s6, s5, s11 +; GFX9-NEXT: s_sub_i32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_sub_i32 s8, s6, s11 +; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s11 +; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; 
GFX9-NEXT: s_endpgm %r = udiv <4 x i32> %x, %y @@ -1476,77 +1510,85 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s12, 0, s8 -; GFX6-NEXT: s_sub_i32 s13, 0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, 
v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: s_sub_i32 s4, 0, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, 
v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm 
; @@ -1810,124 +1852,136 @@ ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s15, 0xf000 -; GFX6-NEXT: s_mov_b32 s14, -1 +; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NEXT: s_mov_b32 s18, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s8, 31 -; GFX6-NEXT: s_add_i32 s3, s8, s2 +; GFX6-NEXT: s_ashr_i32 s2, s12, 31 +; GFX6-NEXT: s_add_i32 s3, s12, s2 ; GFX6-NEXT: s_xor_b32 s3, s3, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_ashr_i32 s8, s9, 31 -; GFX6-NEXT: s_add_i32 s0, s9, s8 -; GFX6-NEXT: s_xor_b32 s9, s0, s8 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s1, 0, s3 -; GFX6-NEXT: s_ashr_i32 s0, s4, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: s_xor_b32 s2, s0, s2 -; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX6-NEXT: s_add_i32 s1, s4, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX6-NEXT: 
s_ashr_i32 s0, s5, 31 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: s_add_i32 s1, s5, s0 -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: s_ashr_i32 s3, s10, 31 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: s_xor_b32 s2, s0, s8 -; GFX6-NEXT: s_add_i32 s0, s10, s3 -; GFX6-NEXT: s_xor_b32 s4, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s4 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 -; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: s_ashr_i32 s2, s11, 31 -; GFX6-NEXT: s_ashr_i32 s0, s6, 31 -; GFX6-NEXT: s_add_i32 s5, s11, s2 -; GFX6-NEXT: s_add_i32 s1, s6, s0 -; GFX6-NEXT: s_xor_b32 s5, s5, s2 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_ashr_i32 s4, s8, 31 +; GFX6-NEXT: s_add_i32 s5, s8, s4 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_xor_b32 s8, s4, s2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s4, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cselect_b32 s2, s4, s2 
+; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_ashr_i32 s4, s13, 31 +; GFX6-NEXT: s_add_i32 s5, s13, s4 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: s_sub_i32 s6, 0, s5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3] +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 +; GFX6-NEXT: s_ashr_i32 s6, s9, 31 +; GFX6-NEXT: s_add_i32 s7, s9, s6 +; GFX6-NEXT: s_xor_b32 s7, s7, s6 +; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: s_xor_b32 s9, s6, s4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 -; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 -; GFX6-NEXT: s_xor_b32 s3, s0, s3 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s6, s4, s5 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 +; GFX6-NEXT: s_cselect_b32 s4, s6, s4 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s4, s5 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_ashr_i32 s6, s14, 31 +; GFX6-NEXT: s_add_i32 s7, s14, s6 +; GFX6-NEXT: s_xor_b32 s7, s7, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GFX6-NEXT: s_sub_i32 s12, 0, s7 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GFX6-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, 
v4 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: s_ashr_i32 s0, s7, 31 -; GFX6-NEXT: s_add_i32 s1, s7, s0 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v4 +; GFX6-NEXT: s_ashr_i32 s12, s10, 31 +; GFX6-NEXT: s_add_i32 s10, s10, s12 +; GFX6-NEXT: s_xor_b32 s10, s10, s12 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 -; GFX6-NEXT: s_xor_b32 s2, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 +; GFX6-NEXT: s_xor_b32 s12, s12, s6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v4, s10, v4 +; GFX6-NEXT: v_readfirstlane_b32 s6, v4 +; GFX6-NEXT: s_mul_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s6, s10, s6 +; GFX6-NEXT: s_sub_i32 s10, s6, s7 +; GFX6-NEXT: s_cmp_ge_u32 s6, s7 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX6-NEXT: s_cselect_b32 s6, s10, s6 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s7 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: s_ashr_i32 s10, s15, 31 +; GFX6-NEXT: s_add_i32 s13, s15, s10 +; GFX6-NEXT: s_xor_b32 s13, s13, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s13 +; GFX6-NEXT: s_sub_i32 s0, 0, s13 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, 
v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7] +; GFX6-NEXT: v_xor_b32_e32 v4, s12, v4 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v3 +; GFX6-NEXT: s_ashr_i32 s0, s11, 31 +; GFX6-NEXT: s_add_i32 s1, s11, s0 +; GFX6-NEXT: s_xor_b32 s1, s1, s0 +; GFX6-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX6-NEXT: s_xor_b32 s0, s0, s10 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 +; GFX6-NEXT: v_readfirstlane_b32 s2, v3 +; GFX6-NEXT: s_mul_i32 s2, s2, s13 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s2, s1, s13 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s13 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: s_cselect_b32 s1, s2, s1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; GFX6-NEXT: s_cmp_ge_u32 s1, s13 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: @@ -1940,114 +1994,122 @@ ; GFX9-NEXT: s_add_i32 s3, s8, s2 ; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_ashr_i32 s12, s9, 31 -; GFX9-NEXT: s_add_i32 s9, s9, s12 -; GFX9-NEXT: s_xor_b32 s9, s9, s12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_i32 s14, 0, s3 ; GFX9-NEXT: s_ashr_i32 s8, s4, 31 +; GFX9-NEXT: s_add_i32 s4, s4, s8 
+; GFX9-NEXT: s_xor_b32 s2, s8, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s4, s4, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 +; GFX9-NEXT: s_add_i32 s12, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 +; GFX9-NEXT: s_mul_i32 s12, s8, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s12 +; GFX9-NEXT: s_add_i32 s13, s8, 1 +; GFX9-NEXT: s_sub_i32 s12, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s8, s13, s8 +; GFX9-NEXT: s_cselect_b32 s4, s12, s4 +; GFX9-NEXT: s_add_i32 s12, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s3, s12, s8 +; GFX9-NEXT: s_ashr_i32 s4, s9, 31 +; GFX9-NEXT: s_add_i32 s8, s9, s4 +; GFX9-NEXT: s_xor_b32 s8, s8, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s4, s9, s4 +; GFX9-NEXT: s_sub_i32 s2, s3, s2 +; GFX9-NEXT: s_xor_b32 s3, s5, s9 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s5, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s9, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s9 +; GFX9-NEXT: s_mul_i32 s9, s5, s8 +; GFX9-NEXT: s_sub_i32 s3, s3, s9 +; GFX9-NEXT: s_add_i32 s12, s5, 1 +; GFX9-NEXT: s_sub_i32 s9, s3, s8 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s5, s12, s5 +; GFX9-NEXT: s_cselect_b32 s3, s9, s3 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s3, s9, s5 +; GFX9-NEXT: s_ashr_i32 s5, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 
+; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s5, s9, s5 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_xor_b32 s4, s6, s9 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s6, 0, s8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s6, s9, s6 +; GFX9-NEXT: s_add_i32 s9, s9, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 +; GFX9-NEXT: s_mul_i32 s9, s6, s8 +; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s6, 1 +; GFX9-NEXT: s_sub_i32 s9, s4, s8 +; GFX9-NEXT: s_cmp_ge_u32 s4, s8 +; GFX9-NEXT: s_cselect_b32 s6, s10, s6 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s8 +; GFX9-NEXT: s_cselect_b32 s4, s9, s6 +; GFX9-NEXT: s_ashr_i32 s6, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s6 +; GFX9-NEXT: s_xor_b32 s8, s8, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s8 -; GFX9-NEXT: s_xor_b32 s4, s4, s8 -; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 +; GFX9-NEXT: s_add_i32 s7, s7, s2 +; GFX9-NEXT: s_xor_b32 s6, s2, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s5 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s14, 0, s9 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s13, s5, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s13 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s5, s5, s13 -; GFX9-NEXT: s_xor_b32 s2, s8, s2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX9-NEXT: s_ashr_i32 s3, s10, 31 -; GFX9-NEXT: s_add_i32 s4, s10, s3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: s_xor_b32 s4, s4, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: s_ashr_i32 s8, s11, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: s_sub_i32 s5, 0, s4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 -; GFX9-NEXT: s_add_i32 s9, s11, s8 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: s_xor_b32 s9, s9, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 -; GFX9-NEXT: s_ashr_i32 s5, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 -; GFX9-NEXT: s_xor_b32 s6, s6, s5 -; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_xor_b32 s2, s13, s12 -; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 -; GFX9-NEXT: s_xor_b32 s2, s5, s3 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 -; 
GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 -; GFX9-NEXT: s_ashr_i32 s3, s7, 31 -; GFX9-NEXT: s_add_i32 s5, s7, s3 -; GFX9-NEXT: s_xor_b32 s5, s5, s3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 -; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 -; GFX9-NEXT: s_xor_b32 s2, s3, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 +; GFX9-NEXT: s_xor_b32 s2, s7, s2 +; GFX9-NEXT: s_sub_i32 s5, 0, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s7 +; GFX9-NEXT: s_mul_i32 s7, s5, s8 +; GFX9-NEXT: s_sub_i32 s2, s2, s7 +; GFX9-NEXT: s_add_i32 s9, s5, 1 +; GFX9-NEXT: s_sub_i32 s7, s2, s8 +; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_cselect_b32 s2, s7, s2 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, s7, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y @@ -2216,109 +2278,117 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s2, s8, 31 -; GFX6-NEXT: s_add_i32 s8, s8, s2 -; GFX6-NEXT: s_xor_b32 s8, s8, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_ashr_i32 s13, s9, 31 -; GFX6-NEXT: s_add_i32 s9, s9, s13 -; GFX6-NEXT: s_xor_b32 s9, s9, s13 +; GFX6-NEXT: s_add_i32 s3, s8, s2 +; GFX6-NEXT: s_xor_b32 s2, s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s14, 0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_ashr_i32 s12, s4, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: s_add_i32 s4, s4, s12 -; GFX6-NEXT: s_xor_b32 s4, s4, s12 -; GFX6-NEXT: v_mul_lo_u32 v2, s14, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s14, 0, s9 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_ashr_i32 s13, s5, 31 -; GFX6-NEXT: s_add_i32 s5, s5, s13 -; GFX6-NEXT: s_xor_b32 s5, s5, s13 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_ashr_i32 s3, s4, 31 +; GFX6-NEXT: s_add_i32 s4, s4, s3 +; GFX6-NEXT: s_xor_b32 s4, s4, s3 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s8, v0 +; GFX6-NEXT: s_mul_i32 s8, s8, s2 +; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s8, s4, s2 +; GFX6-NEXT: s_cmp_ge_u32 s4, s2 +; GFX6-NEXT: s_cselect_b32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s8, s4, s2 +; GFX6-NEXT: 
s_cmp_ge_u32 s4, s2 +; GFX6-NEXT: s_cselect_b32 s2, s8, s4 +; GFX6-NEXT: s_ashr_i32 s4, s9, 31 +; GFX6-NEXT: s_add_i32 s8, s9, s4 +; GFX6-NEXT: s_xor_b32 s4, s8, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: s_sub_i32 s8, 0, s4 +; GFX6-NEXT: s_xor_b32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s9, s2, s3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_add_i32 s5, s5, s8 +; GFX6-NEXT: s_xor_b32 s5, s5, s8 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s4 +; GFX6-NEXT: s_cmp_ge_u32 s2, s4 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s4 +; GFX6-NEXT: s_cmp_ge_u32 s2, s4 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_ashr_i32 s3, s10, 31 +; GFX6-NEXT: s_add_i32 s4, s10, s3 +; GFX6-NEXT: s_xor_b32 s3, s4, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_ashr_i32 s4, s6, 31 +; GFX6-NEXT: s_add_i32 s5, s6, s4 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_sub_i32 s6, s2, s8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s5, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s5, s2, s3 +; GFX6-NEXT: s_cmp_ge_u32 s2, s3 +; GFX6-NEXT: s_cselect_b32 s5, s5, s2 +; 
GFX6-NEXT: s_ashr_i32 s2, s11, 31 +; GFX6-NEXT: s_add_i32 s3, s11, s2 +; GFX6-NEXT: s_xor_b32 s8, s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s10, 0, s8 +; GFX6-NEXT: s_xor_b32 s5, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s9 +; GFX6-NEXT: s_ashr_i32 s9, s7, 31 +; GFX6-NEXT: s_add_i32 s7, s7, s9 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: s_xor_b32 s7, s7, s9 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: s_ashr_i32 s4, s10, 31 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: s_add_i32 s8, s10, s4 -; GFX6-NEXT: s_xor_b32 s4, s8, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 -; GFX6-NEXT: s_sub_i32 s5, 0, s4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s5, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX6-NEXT: s_ashr_i32 s8, s11, 31 -; GFX6-NEXT: s_add_i32 s9, s11, 
s8 -; GFX6-NEXT: s_ashr_i32 s5, s6, 31 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 -; GFX6-NEXT: s_add_i32 s6, s6, s5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: s_xor_b32 s6, s6, s5 -; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX6-NEXT: s_sub_i32 s6, 0, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 -; GFX6-NEXT: s_ashr_i32 s6, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s6 -; GFX6-NEXT: s_xor_b32 s7, s7, s6 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s5, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s5, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s6 +; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: s_mul_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s7, s5 +; GFX6-NEXT: s_sub_i32 s6, s5, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s8 +; GFX6-NEXT: s_cselect_b32 s5, s6, 
s5 +; GFX6-NEXT: s_sub_i32 s6, s5, s8 +; GFX6-NEXT: s_cmp_ge_u32 s5, s8 +; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: s_xor_b32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -2993,70 +3063,70 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s8, s8, 1 +; GFX6-NEXT: s_or_b32 s10, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s8, s10, 0 +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: s_sext_i32_i16 s4, s7 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; 
GFX6-NEXT: s_sext_i32_i16 s6, s5 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s4, s6, s4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s8, v2 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: s_ashr_i32 s4, s7, 16 +; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_ashr_i32 s6, s7, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 +; GFX6-NEXT: s_ashr_i32 s4, s5, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 +; GFX6-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; 
GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -3264,80 +3334,80 @@ ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s8, s8, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: s_or_b32 s10, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_ashr_i32 s9, s6, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s8, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s8, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_ashr_i32 s8, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s4, s9 +; GFX6-NEXT: s_xor_b32 s4, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v1| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s7 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: s_sext_i32_i16 s6, s5 ; GFX6-NEXT: s_xor_b32 s4, s6, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v2| ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: s_ashr_i32 s4, s7, 16 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: s_ashr_i32 s6, s7, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: s_lshr_b32 s6, s7, 16 +; GFX6-NEXT: s_lshr_b32 s8, s7, 16 ; GFX6-NEXT: s_ashr_i32 s7, s5, 16 ; 
GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_xor_b32 s4, s7, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_xor_b32 s6, s7, s6 +; GFX6-NEXT: s_ashr_i32 s6, s6, 30 +; GFX6-NEXT: s_lshr_b32 s4, s5, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s6 -; GFX6-NEXT: s_lshr_b32 s4, s5, 16 +; GFX6-NEXT: s_or_b32 s9, s6, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, |v2| +; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX6-NEXT: s_cselect_b32 s6, s9, 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v5 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3618,15 +3688,15 @@ ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3696,22 +3766,22 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: s_xor_b32 s2, s5, s2 +; GFX6-NEXT: s_bfe_i32 s3, s4, 0x30000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 +; GFX6-NEXT: s_xor_b32 s2, s3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 -; GFX6-NEXT: s_or_b32 s2, s2, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: s_lshr_b32 s5, s4, 8 +; GFX6-NEXT: s_or_b32 s6, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_lshr_b32 s3, s4, 8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX6-NEXT: s_cselect_b32 s2, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -4184,53 +4254,53 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s8, s8, 1 +; GFX6-NEXT: s_or_b32 s10, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s8, s10, 0 +; GFX6-NEXT: s_ashr_i32 
s6, s6, 16 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s8, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_sext_i32_i16 s6, s7 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NEXT: s_sext_i32_i16 s4, s7 +; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v0| +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 +; GFX6-NEXT: s_sext_i32_i16 s4, s5 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX6-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, 
|v3|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i16: @@ -4394,37 +4464,37 @@ ; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s8, s8, 30 -; GFX6-NEXT: s_or_b32 s8, s8, 1 -; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: s_or_b32 s10, s8, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX6-NEXT: s_ashr_i32 s9, s6, 16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s8, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s8, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_ashr_i32 s8, s6, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GFX6-NEXT: s_lshr_b32 s10, s4, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s4, s9 +; 
GFX6-NEXT: s_xor_b32 s4, s4, s8 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v1| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s7 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX6-NEXT: s_sext_i32_i16 s6, s5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 @@ -4436,12 +4506,12 @@ ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GFX6-NEXT: s_cselect_b32 s4, s4, 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -4826,7 +4896,7 @@ ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_lshr_b32 s4, s6, 15 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 @@ -4997,54 +5067,54 @@ ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, 
s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v4 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 ; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: s_or_b32 s4, s4, 1 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| +; GFX6-NEXT: 
v_cvt_f32_i32_e32 v2, v1 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 +; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -5216,52 +5286,53 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 -; GFX6-NEXT: s_bfe_i32 s9, s6, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s9 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: s_xor_b32 s5, s9, s5 -; GFX6-NEXT: s_ashr_i32 s5, s5, 30 -; GFX6-NEXT: s_or_b32 s5, s5, 1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: v_mov_b32_e32 v7, s5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| -; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_lshr_b32 s7, s6, 15 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_lshr_b32 s9, s8, 15 +; GFX6-NEXT: s_or_b32 s10, s4, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v6 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 -; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: s_lshr_b32 s4, s6, 15 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 -; GFX6-NEXT: s_bfe_i32 s6, s6, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 -; GFX6-NEXT: s_xor_b32 s5, s6, s5 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX6-NEXT: s_ashr_i32 s5, s5, 30 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 +; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: s_or_b32 s5, s5, 1 -; GFX6-NEXT: 
v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: v_mov_b32_e32 v8, s5 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -5272,12 +5343,11 @@ ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX6-NEXT: s_lshr_b32 s7, s8, 15 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s7 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -5655,95 +5725,104 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_i32 s0, 0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: 
v_mul_lo_u32 v2, s0, v0 -; GFX6-NEXT: s_sub_i32 s0, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_sub_i32 s4, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: s_mul_i32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s0, s5, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s6 +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, 1, v1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GFX6-NEXT: s_cmp_ge_u32 s0, s6 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s7 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 -; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 +; GFX9-NEXT: s_mul_i32 s7, s6, s3 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: s_add_i32 s9, s6, 1 +; GFX9-NEXT: s_sub_i32 s7, s4, s3 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: s_cselect_b32 s6, s9, s6 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: s_cselect_b32 s3, s7, s6 +; GFX9-NEXT: s_sub_i32 s4, 0, s2 +; GFX9-NEXT: s_mul_i32 s4, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX9-NEXT: s_mul_i32 s6, s4, s2 +; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_add_i32 s7, s4, 1 +; GFX9-NEXT: s_sub_i32 s6, s5, s2 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: s_cselect_b32 s5, s6, s5 +; GFX9-NEXT: s_add_i32 s6, s4, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s2 +; GFX9-NEXT: s_cselect_b32 s2, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl 
<2 x i32> , %y %r = udiv <2 x i32> %x, %shl.y @@ -5981,45 +6060,49 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_sub_i32 s2, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s4, s4, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: 
v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s6 +; GFX6-NEXT: s_sub_i32 s5, s5, s7 +; GFX6-NEXT: s_sub_i32 s7, s5, s6 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_sub_i32 s7, s5, s6 +; GFX6-NEXT: s_cmp_ge_u32 s5, s6 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6181,7 +6264,6 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_xor_b32 s2, s2, s9 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 @@ -6189,56 +6271,62 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; 
GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s1, s0, s3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cselect_b32 s0, s1, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s0, s3 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_xor_b32 s0, s9, s8 +; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: s_ashr_i32 s5, s2, 31 ; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s2, s2, s5 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: s_xor_b32 s2, s5, s4 -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s6 +; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_sub_i32 s8, s2, s3 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_add_i32 s7, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s7, s6 +; GFX9-NEXT: s_xor_b32 s3, s5, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6446,148 +6534,158 @@ ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 -; GFX6-NEXT: s_ashr_i32 s1, s0, 31 -; GFX6-NEXT: s_add_i32 s0, s0, s1 -; GFX6-NEXT: s_xor_b32 s2, s0, s1 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, s3 ; GFX6-NEXT: 
v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 -; GFX6-NEXT: s_ashr_i32 s3, s0, 31 -; GFX6-NEXT: s_add_i32 s0, s0, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s11, 0, s2 -; GFX6-NEXT: s_xor_b32 s10, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_ashr_i32 s0, s8, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: s_add_i32 s8, s8, s0 -; GFX6-NEXT: v_mul_lo_u32 v2, s11, v0 -; GFX6-NEXT: s_xor_b32 s8, s8, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_xor_b32 s11, s0, s1 -; GFX6-NEXT: s_sub_i32 s0, 0, s10 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s2, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s9, 31 -; GFX6-NEXT: s_add_i32 s1, s9, s0 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: s_xor_b32 s2, s0, s3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, s11, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 -; GFX6-NEXT: 
v_subrev_i32_e32 v0, vcc, s11, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_add_i32 s4, s4, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s6 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_xor_b32 s6, s6, s3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_mul_i32 s3, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s4, s3 +; GFX6-NEXT: s_sub_i32 s4, s3, s2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_cselect_b32 s3, s4, s3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_ashr_i32 s4, s7, 31 +; GFX6-NEXT: s_add_i32 s7, s7, s4 +; GFX6-NEXT: s_xor_b32 s7, s7, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX6-NEXT: s_sub_i32 s8, 0, s7 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v3, s8, v2 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_add_i32 s5, s5, s8 +; GFX6-NEXT: s_xor_b32 s5, s5, s8 +; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 +; GFX6-NEXT: s_xor_b32 s4, s8, s4 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX6-NEXT: v_readfirstlane_b32 s6, v1 +; GFX6-NEXT: s_mul_i32 
s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_sub_i32 s6, s5, s7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s8, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s10, 0, s0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_ashr_i32 s7, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s10, 0, s6 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s3, s7, s3 ; 
GFX9-NEXT: s_xor_b32 s4, s4, s7 -; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s5, s5, s9 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: s_xor_b32 s1, s7, s1 -; GFX9-NEXT: s_xor_b32 s0, s9, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] -; GFX9-NEXT: s_endpgm - %shl.y = shl <2 x i32> , %y - %r = sdiv <2 x i32> %x, %shl.y - store <2 x i32> %r, <2 x i32> addrspace(1)* %out - ret void -} - -define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { -; CHECK-LABEL: @srem_i32_oddk_denom( -; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 -; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s2 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: s_add_i32 s9, s7, 1 +; GFX9-NEXT: s_sub_i32 s8, s4, s2 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s8, s7 +; GFX9-NEXT: s_ashr_i32 s4, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s4 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_ashr_i32 s7, s5, 31 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s4, s7, s4 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s3, s5, s7 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s5, 0, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_mul_i32 s5, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s7, s5 +; GFX9-NEXT: s_mul_hi_u32 s5, s3, s7 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_sub_i32 s3, s3, s7 +; GFX9-NEXT: s_add_i32 s8, s5, 1 +; GFX9-NEXT: s_sub_i32 s7, s3, s6 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s5, s8, s5 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 +; GFX9-NEXT: s_add_i32 s7, s5, 1 +; GFX9-NEXT: s_cmp_ge_u32 s3, s6 +; GFX9-NEXT: s_cselect_b32 s3, s7, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm + %shl.y = shl <2 x i32> , %y + %r = sdiv <2 x i32> %x, %shl.y + store <2 x i32> %r, <2 x i32> addrspace(1)* %out + ret void +} + +define 
amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { +; CHECK-LABEL: @srem_i32_oddk_denom( +; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 +; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; ; GFX6-LABEL: srem_i32_oddk_denom: @@ -6699,18 +6797,20 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s4 +; GFX6-NEXT: s_sub_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s7, s6, s4 +; GFX6-NEXT: s_cmp_ge_u32 s6, s4 +; GFX6-NEXT: s_cselect_b32 s6, s7, s6 +; GFX6-NEXT: s_sub_i32 s7, s6, s4 +; GFX6-NEXT: s_cmp_ge_u32 s6, s4 +; GFX6-NEXT: s_cselect_b32 s4, s7, s6 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6903,57 +7003,61 @@ ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 ; GFX6-NEXT: s_ashr_i32 s3, s2, 31 ; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s6, s2, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 -; GFX6-NEXT: s_ashr_i32 s8, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s8 +; GFX6-NEXT: s_xor_b32 s2, s2, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_sub_i32 s3, 0, s2 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 
v0, v0 -; GFX6-NEXT: s_xor_b32 s7, s7, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_i32 s9, 0, s6 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: s_ashr_i32 s8, s4, 31 -; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_sub_i32 s9, 0, s7 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_add_i32 s3, s4, s6 +; GFX6-NEXT: s_xor_b32 s3, s3, s6 +; GFX6-NEXT: s_lshl_b32 s4, 0x1000, s7 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s2 +; GFX6-NEXT: s_sub_i32 s3, s3, s7 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s3, s7, s3 +; GFX6-NEXT: s_sub_i32 s7, s3, s2 +; GFX6-NEXT: s_cmp_ge_u32 s3, s2 +; GFX6-NEXT: s_cselect_b32 s7, s7, s3 +; GFX6-NEXT: s_ashr_i32 s2, s4, 31 +; GFX6-NEXT: s_add_i32 s4, s4, s2 +; GFX6-NEXT: s_xor_b32 s4, s4, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: s_sub_i32 s2, 0, s4 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_xor_b32 s7, s7, s6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_sub_i32 s6, s7, s6 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_add_i32 s2, s5, s8 +; GFX6-NEXT: s_xor_b32 s5, s2, s8 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 -; GFX6-NEXT: s_add_i32 s5, s5, s9 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 -; 
GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: s_xor_b32 s4, s5, s9 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_mul_i32 s7, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s5, s7 +; GFX6-NEXT: s_sub_i32 s7, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s5, s7, s5 +; GFX6-NEXT: s_sub_i32 s7, s5, s4 +; GFX6-NEXT: s_cmp_ge_u32 s5, s4 +; GFX6-NEXT: s_cselect_b32 s4, s7, s5 +; GFX6-NEXT: s_xor_b32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -7079,9 +7183,9 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v4, 
v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7133,12 +7237,13 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 @@ -7148,9 +7253,8 @@ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -7256,33 +7360,33 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_add_u32 s6, s3, 2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] -; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: s_add_u32 s9, s3, 1 -; GFX9-NEXT: s_addc_u32 s1, s2, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s7, s7, s8 -; GFX9-NEXT: s_cmpk_gt_u32 s7, 0x11e -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_add_u32 s0, s3, 1 +; GFX9-NEXT: s_addc_u32 s6, s2, 0 +; GFX9-NEXT: s_add_u32 s1, s3, 2 +; GFX9-NEXT: s_addc_u32 s9, s2, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: s_cselect_b32 s8, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s8 +; GFX9-NEXT: s_cmpk_gt_u32 s0, 0x11e +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 -; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x11f -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v4, s[0:1] +; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x11f ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 @@ -7431,7 +7535,7 @@ ; GFX6-NEXT: s_movk_i32 s6, 0xf001 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_movk_i32 s8, 0xfff ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -7439,8 +7543,8 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b64 
s[8:9], s[0:1], 12 -; GFX6-NEXT: s_movk_i32 s0, 0xfff +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 @@ -7498,143 +7602,159 @@ ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s8 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, s8 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, s8 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s8, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc -; GFX6-NEXT: s_movk_i32 s0, 0xffe -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GFX6-NEXT: s_movk_i32 s2, 0xffe +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX6-NEXT: 
v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s2, 0xf001 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 
v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s2 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc -; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc -; GFX9-NEXT: s_movk_i32 s0, 0xffe -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0xfffff001 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: s_sub_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s8, 0xfffff001 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_mul_i32 s11, s4, 0xfffff001 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s5 +; GFX9-NEXT: s_mul_i32 s10, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX9-NEXT: s_mul_i32 s11, s8, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s11 +; 
GFX9-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX9-NEXT: s_addc_u32 s4, s9, s12 +; GFX9-NEXT: s_addc_u32 s9, s10, 0 +; GFX9-NEXT: s_mul_i32 s5, s8, s5 +; GFX9-NEXT: s_add_u32 s4, s4, s5 +; GFX9-NEXT: s_addc_u32 s5, 0, s9 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s4, s8, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, 0xfffff001 +; GFX9-NEXT: s_mul_i32 s5, s4, 0xfffff001 +; GFX9-NEXT: s_sub_i32 s9, s9, s8 +; GFX9-NEXT: s_add_i32 s9, s9, s5 +; GFX9-NEXT: s_mul_i32 s11, s8, 0xfffff001 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s9 +; GFX9-NEXT: s_mul_i32 s10, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s8, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s10 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s11 +; GFX9-NEXT: s_mul_i32 s11, s4, s11 +; GFX9-NEXT: s_add_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s10, s4, s9 +; GFX9-NEXT: s_addc_u32 s5, s5, s12 +; GFX9-NEXT: s_addc_u32 s8, s10, 0 +; GFX9-NEXT: s_mul_i32 s9, s4, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s9 +; GFX9-NEXT: s_addc_u32 s8, 0, s8 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s4, s4, s8 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s8, s6, s4 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s4 +; GFX9-NEXT: s_add_u32 s8, s10, s8 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s7, s4 +; GFX9-NEXT: s_addc_u32 s5, s5, s11 +; GFX9-NEXT: s_addc_u32 s8, s10, 0 +; GFX9-NEXT: s_mul_i32 s4, s7, s4 +; GFX9-NEXT: s_add_u32 s4, s5, s4 +; GFX9-NEXT: s_addc_u32 s5, 0, s8 +; GFX9-NEXT: s_add_u32 s8, s4, 1 +; GFX9-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-NEXT: s_add_u32 s10, s4, 2 +; GFX9-NEXT: s_mul_i32 s13, s5, 0xfff +; GFX9-NEXT: s_mul_hi_u32 s14, s4, 0xfff +; GFX9-NEXT: s_addc_u32 s11, s5, 0 +; GFX9-NEXT: 
s_add_i32 s14, s14, s13 +; GFX9-NEXT: s_mul_i32 s13, s4, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX9-NEXT: s_movk_i32 s12, 0xfff +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s7, s14 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s12, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s7, s6, 0 +; GFX9-NEXT: s_movk_i32 s12, 0xffe +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s12, v1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s12, v0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, @@ -7806,19 +7926,19 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v4, s7 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GFX6-NEXT: v_cmp_lt_u32_e32 
vcc, s5, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm ; @@ -7914,40 +8034,40 @@ ; GFX9-NEXT: s_mov_b32 s8, 0x9761f7c9 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_subb_u32 s6, s1, 0x11f -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v0 +; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s8, v0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s10, s6, 0 ; GFX9-NEXT: s_cmpk_gt_u32 s10, 0x11e ; GFX9-NEXT: s_cselect_b32 s11, -1, 0 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s12, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s12, v1 ; GFX9-NEXT: s_cmpk_eq_i32 s10, 0x11f -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v4, s11 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] ; GFX9-NEXT: s_subb_u32 s2, s6, 0x11f -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v3 +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s2, 0 +; GFX9-NEXT: s_subb_u32 s2, s2, 0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 
v1, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s2, s7, s9 -; GFX9-NEXT: s_cmpk_gt_u32 s2, 0x11e -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s9 +; GFX9-NEXT: s_cmpk_gt_u32 s0, 0x11e +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s12, v0 -; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x11f -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x11f +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm @@ -8157,9 +8277,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s8, 0xffed2705 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -8167,15 +8287,15 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s4, 0x12d8fb +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8185,8 +8305,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -8194,12 +8312,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 -; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -8214,176 +8330,196 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: s_ashr_i32 s8, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; 
GFX6-NEXT: s_add_u32 s6, s6, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s7, s7, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, s4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, s4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, s7 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s6, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, 
vcc, 0, v4, vcc -; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GFX6-NEXT: s_mov_b32 s4, 0x12d8fa +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i64_oddk_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 -; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4996c7d8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, 0xffed2705 +; GFX9-NEXT: s_mul_i32 s7, s4, 0xffed2705 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_sub_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_i32 s9, s5, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s6 +; GFX9-NEXT: s_mul_i32 s8, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s10, s4, s9 +; GFX9-NEXT: s_mul_i32 s9, s4, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_addc_u32 s5, s7, s10 +; GFX9-NEXT: s_addc_u32 s7, s8, 0 +; GFX9-NEXT: s_mul_i32 
s6, s4, s6 +; GFX9-NEXT: s_add_u32 s5, s5, s6 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s4, s4, s6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705 +; GFX9-NEXT: s_add_i32 s7, s7, s5 +; GFX9-NEXT: s_sub_i32 s5, s7, s6 +; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5 +; GFX9-NEXT: s_mul_i32 s12, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 +; GFX9-NEXT: s_addc_u32 s8, 0, s11 +; GFX9-NEXT: s_add_u32 s6, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 +; GFX9-NEXT: s_addc_u32 s6, s8, s9 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_mul_i32 s5, s4, s5 +; GFX9-NEXT: s_add_u32 s5, s6, s5 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s6, s4, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, 
s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: s_endpgm - %r = sdiv i64 %x, 1235195 - store i64 %r, i64 addrspace(1)* %out - ret void -} +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s7, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_add_u32 s7, s9, s7 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6 +; GFX9-NEXT: s_addc_u32 s5, s5, s10 +; GFX9-NEXT: s_addc_u32 s7, s9, 0 +; GFX9-NEXT: s_mul_i32 s6, s3, s6 +; GFX9-NEXT: s_add_u32 s5, s5, s6 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: s_add_u32 s7, s5, 1 +; GFX9-NEXT: s_addc_u32 s8, s6, 0 +; GFX9-NEXT: s_add_u32 s9, s5, 2 +; GFX9-NEXT: s_mul_i32 s12, s6, 0x12d8fb +; GFX9-NEXT: s_mul_hi_u32 s13, s5, 0x12d8fb +; GFX9-NEXT: s_addc_u32 s10, s6, 0 +; GFX9-NEXT: s_add_i32 s13, s13, s12 +; GFX9-NEXT: s_mul_i32 s12, s5, 0x12d8fb +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: s_mov_b32 s11, 0x12d8fb +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s2, s3, s13 +; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s11, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s3, s2, 0 +; GFX9-NEXT: s_mov_b32 s11, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s11, v1 +; GFX9-NEXT: s_cmp_eq_u32 s3, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, -1, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s11, v0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v4, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm + %r = sdiv i64 %x, 1235195 + store i64 %r, i64 addrspace(1)* %out + ret void +} define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { ; CHECK-LABEL: @sdiv_i64_pow2k_denom( @@ -8493,9 +8629,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -8545,12 +8681,13 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; 
GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 @@ -8560,10 +8697,9 @@ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 @@ -8688,31 +8824,31 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_add_u32 s6, s12, 2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] -; GFX9-NEXT: s_addc_u32 s0, s13, 0 -; GFX9-NEXT: s_add_u32 s15, s12, 1 -; GFX9-NEXT: s_addc_u32 s1, s13, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s7, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: s_add_u32 s0, s12, 1 +; GFX9-NEXT: s_addc_u32 s6, s13, 0 +; GFX9-NEXT: s_add_u32 s1, s12, 2 +; GFX9-NEXT: s_addc_u32 s15, s13, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s7, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s14 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s13 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] @@ -8825,10 +8961,10 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 -; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 +; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8838,8 +8974,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: s_ashr_i32 s10, s3, 31 -; GFX6-NEXT: s_add_u32 s0, s2, s10 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s8 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -8849,9 +8985,9 @@ ; GFX6-NEXT: v_addc_u32_e32 v1, 
vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 -; GFX6-NEXT: s_mov_b32 s11, s10 -; GFX6-NEXT: s_addc_u32 s1, s3, s10 -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: s_mov_b32 s9, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 @@ -8871,180 +9007,195 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX6-NEXT: s_movk_i32 s2, 0xfff +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: s_movk_i32 s9, 0xfff ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, s9 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_mul_lo_u32 v8, v0, s9 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GFX6-NEXT: 
v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 +; GFX6-NEXT: v_mov_b32_e32 v5, s3 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc -; GFX6-NEXT: s_movk_i32 s0, 0xffe -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GFX6-NEXT: s_movk_i32 s2, 0xffe +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s10 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 ; GFX6-NEXT: 
v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s8, 0xf001 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x457ff000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f800000 +; GFX9-NEXT: v_mac_f32_e32 v1, 0, v2 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s5, 31 ; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v2 ; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, 
vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s8 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s7, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 -; 
GFX9-NEXT: s_movk_i32 s6, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s6 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s6 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc -; GFX9-NEXT: s_movk_i32 s0, 0xffe -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_mul_hi_u32 s5, s4, 0xfffff001 +; GFX9-NEXT: 
s_mul_i32 s9, s8, 0xfffff001 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_sub_i32 s5, s5, s4 +; GFX9-NEXT: s_mul_i32 s11, s4, 0xfffff001 +; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s5 +; GFX9-NEXT: s_mul_i32 s10, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s10 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX9-NEXT: s_mul_i32 s11, s8, s11 +; GFX9-NEXT: s_add_u32 s4, s4, s11 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s5 +; GFX9-NEXT: s_addc_u32 s4, s9, s12 +; GFX9-NEXT: s_addc_u32 s9, s10, 0 +; GFX9-NEXT: s_mul_i32 s5, s8, s5 +; GFX9-NEXT: s_add_u32 s4, s4, s5 +; GFX9-NEXT: s_addc_u32 s5, 0, s9 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s4, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s4, s8, s5 +; GFX9-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-NEXT: s_mul_i32 s5, s4, 0xfffff001 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, 0xfffff001 +; GFX9-NEXT: s_add_i32 s9, s9, s5 +; GFX9-NEXT: s_sub_i32 s5, s9, s8 +; GFX9-NEXT: s_mul_i32 s10, s8, 0xfffff001 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s5 +; GFX9-NEXT: s_mul_i32 s14, s8, s5 +; GFX9-NEXT: s_mul_hi_u32 s8, s8, s10 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s10 +; GFX9-NEXT: s_mul_i32 s12, s4, s10 +; GFX9-NEXT: s_addc_u32 s10, 0, s13 +; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s5 +; GFX9-NEXT: s_addc_u32 s8, s10, s11 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mul_i32 s5, s4, s5 +; GFX9-NEXT: s_add_u32 s5, s8, s5 +; GFX9-NEXT: s_addc_u32 s8, 0, s9 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s8, s4, s8 +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: s_add_u32 s6, s6, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s7, s7, s4 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s10, v1 +; GFX9-NEXT: s_mul_i32 s9, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s10 +; 
GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 +; GFX9-NEXT: s_add_u32 s9, s11, s9 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s12, s7, s10 +; GFX9-NEXT: s_mul_i32 s10, s7, s10 +; GFX9-NEXT: s_add_u32 s9, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s11, s7, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, s12 +; GFX9-NEXT: s_addc_u32 s9, s11, 0 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_addc_u32 s8, 0, s9 +; GFX9-NEXT: s_add_u32 s9, s5, 1 +; GFX9-NEXT: s_addc_u32 s10, s8, 0 +; GFX9-NEXT: s_add_u32 s11, s5, 2 +; GFX9-NEXT: s_mul_i32 s14, s8, 0xfff +; GFX9-NEXT: s_mul_hi_u32 s15, s5, 0xfff +; GFX9-NEXT: s_addc_u32 s12, s8, 0 +; GFX9-NEXT: s_add_i32 s15, s15, s14 +; GFX9-NEXT: s_mul_i32 s14, s5, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 +; GFX9-NEXT: s_movk_i32 s13, 0xfff +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s6, s7, s15 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s13, v1 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_subb_u32 s7, s6, 0 +; GFX9-NEXT: s_movk_i32 s13, 0xffe +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s13, v2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s13, v1 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; 
GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s4, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -9099,7 +9250,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 -; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -9160,9 +9310,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -9174,72 +9324,71 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] +; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[14:15] +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: s_add_u32 s2, s2, s4 -; 
GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v6, s5 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s3 ; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 -; GFX6-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX6-NEXT: v_rcp_f32_e32 v3, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: s_sub_u32 s0, 0, s2 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: s_subb_u32 s1, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 
v7, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: s_sub_u32 s12, 0, s2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v3 +; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX6-NEXT: v_mul_lo_u32 v8, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s15, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 ; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 @@ -9254,12 +9403,14 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s0, s6, s12 +; GFX6-NEXT: s_add_u32 s6, s6, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_addc_u32 s1, s7, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s7, s7, s12 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 @@ -9269,7 +9420,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NEXT: v_mov_b32_e32 v6, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc @@ -9277,12 +9428,12 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 @@ -9295,12 +9446,13 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 ; GFX6-NEXT: v_cndmask_b32_e64 
v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 2, v2 ; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 @@ -9310,10 +9462,9 @@ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, s1 @@ -9439,36 +9590,36 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_add_u32 s4, s16, 2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GFX9-NEXT: s_addc_u32 s0, s17, 0 -; GFX9-NEXT: s_add_u32 s19, s16, 1 -; GFX9-NEXT: s_addc_u32 s1, s17, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s5, s5, s18 -; GFX9-NEXT: s_cmp_ge_u32 s5, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_add_u32 s0, s16, 1 +; GFX9-NEXT: s_addc_u32 s4, s17, 0 +; GFX9-NEXT: s_add_u32 s1, s16, 2 +; GFX9-NEXT: s_addc_u32 s19, s17, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: s_cselect_b32 s18, 
-1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX9-NEXT: s_cmp_eq_u32 s5, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s5, s18 +; GFX9-NEXT: s_cmp_ge_u32 s0, s13 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX9-NEXT: s_cmp_eq_u32 s0, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] ; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX9-NEXT: s_add_u32 s8, s10, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s9, s11, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -9579,31 +9730,31 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: s_add_u32 s6, s12, 2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] -; GFX9-NEXT: s_addc_u32 s0, s13, 0 -; GFX9-NEXT: s_add_u32 s15, s12, 1 -; GFX9-NEXT: s_addc_u32 s1, s13, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s7, s7, s14 -; GFX9-NEXT: s_cmp_ge_u32 s7, s9 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: s_add_u32 s0, s12, 1 +; GFX9-NEXT: s_addc_u32 s6, 
s13, 0 +; GFX9-NEXT: s_add_u32 s1, s12, 2 +; GFX9-NEXT: s_addc_u32 s15, s13, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX9-NEXT: s_cmp_eq_u32 s7, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s14 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] @@ -9632,23 +9783,19 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s8, 0xffed2705 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s8, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s8 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -9660,8 +9807,6 @@ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -9669,11 +9814,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -9688,57 +9832,64 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s8, s7, 31 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: s_add_u32 s0, s6, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_mov_b32 s9, 
s8 +; GFX6-NEXT: s_addc_u32 s1, s7, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s4, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v0 ; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 ; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc -; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, 
s0, v2 +; GFX6-NEXT: s_mov_b32 s4, 0x12d8fa +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i64_oddk_denom: @@ -9747,7 +9898,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 ; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -9755,115 +9906,115 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: 
s_mul_hi_u32 s2, s1, 0xffed2705 -; GFX9-NEXT: s_mul_i32 s3, s0, 0xffed2705 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s1 -; GFX9-NEXT: s_mul_i32 s9, s1, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s3, s1, s2 -; GFX9-NEXT: s_mul_i32 s8, s1, s2 -; GFX9-NEXT: s_mul_hi_u32 s1, s1, s9 -; GFX9-NEXT: s_add_u32 s1, s1, s8 -; GFX9-NEXT: s_addc_u32 s3, 0, s3 -; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 -; GFX9-NEXT: s_mul_i32 s9, s0, s9 -; GFX9-NEXT: s_add_u32 s1, s1, s9 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s3, s10 -; GFX9-NEXT: s_addc_u32 s3, s8, 0 -; GFX9-NEXT: s_mul_i32 s2, s0, s2 -; GFX9-NEXT: s_add_u32 s1, s1, s2 -; GFX9-NEXT: s_addc_u32 s2, 0, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s5, v0 +; GFX9-NEXT: s_mul_hi_u32 s6, s5, 0xffed2705 +; GFX9-NEXT: s_mul_i32 s7, s4, 0xffed2705 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_sub_i32 s6, s6, s5 +; GFX9-NEXT: s_mul_i32 s9, s5, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s7, s5, s6 +; GFX9-NEXT: s_mul_i32 s8, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_addc_u32 s7, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s10, s4, s9 +; GFX9-NEXT: s_mul_i32 s9, s4, s9 +; GFX9-NEXT: s_add_u32 s5, s5, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX9-NEXT: s_addc_u32 s5, s7, s10 +; GFX9-NEXT: s_addc_u32 s7, s8, 0 +; GFX9-NEXT: s_mul_i32 s6, s4, s6 +; GFX9-NEXT: s_add_u32 s5, s5, s6 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s0, s0, s2 -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705 -; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_sub_i32 s1, s3, s2 -; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705 -; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1 -; GFX9-NEXT: s_mul_i32 s12, s2, s1 -; GFX9-NEXT: s_mul_hi_u32 s2, s2, 
s8 -; GFX9-NEXT: s_add_u32 s2, s2, s12 -; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 -; GFX9-NEXT: s_mul_i32 s10, s0, s8 +; GFX9-NEXT: s_addc_u32 s4, s4, s6 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705 +; GFX9-NEXT: s_add_i32 s7, s7, s5 +; GFX9-NEXT: s_sub_i32 s5, s7, s6 +; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5 +; GFX9-NEXT: s_mul_i32 s12, s6, s5 +; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8 +; GFX9-NEXT: s_add_u32 s6, s6, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8 +; GFX9-NEXT: s_mul_i32 s10, s4, s8 ; GFX9-NEXT: s_addc_u32 s8, 0, s11 -; GFX9-NEXT: s_add_u32 s2, s2, s10 -; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 -; GFX9-NEXT: s_addc_u32 s2, s8, s9 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: s_mul_i32 s1, s0, s1 -; GFX9-NEXT: s_add_u32 s1, s2, s1 -; GFX9-NEXT: s_addc_u32 s2, 0, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 +; GFX9-NEXT: s_add_u32 s6, s6, s10 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5 +; GFX9-NEXT: s_addc_u32 s6, s8, s9 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_mul_i32 s5, s4, s5 +; GFX9-NEXT: s_add_u32 s5, s6, s5 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s8, s0, s2 +; GFX9-NEXT: s_addc_u32 s6, s4, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s0, s7 -; GFX9-NEXT: s_mul_hi_u32 s3, s0, s8 -; GFX9-NEXT: s_add_u32 s6, s9, s6 -; GFX9-NEXT: s_addc_u32 s3, 0, s3 -; GFX9-NEXT: s_mul_hi_u32 s10, s1, s7 -; GFX9-NEXT: s_mul_i32 s7, s1, s7 -; GFX9-NEXT: s_add_u32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s9, s1, s8 -; GFX9-NEXT: s_addc_u32 s3, s3, s10 -; GFX9-NEXT: s_addc_u32 
s6, s9, 0 -; GFX9-NEXT: s_mul_i32 s7, s1, s8 -; GFX9-NEXT: s_add_u32 s3, s3, s7 -; GFX9-NEXT: s_addc_u32 s6, 0, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x12d8fb -; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s7, s2, s6 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 +; GFX9-NEXT: s_add_u32 s7, s9, s7 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6 +; GFX9-NEXT: s_addc_u32 s5, s5, s10 +; GFX9-NEXT: s_addc_u32 s7, s9, 0 +; GFX9-NEXT: s_mul_i32 s6, s3, s6 +; GFX9-NEXT: s_add_u32 s5, s5, s6 +; GFX9-NEXT: s_addc_u32 s6, 0, s7 +; GFX9-NEXT: s_mul_hi_u32 s8, s5, 0x12d8fb +; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb ; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb -; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_add_i32 s8, s8, s6 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s3, s1, s8 +; GFX9-NEXT: s_subb_u32 s2, s3, s8 ; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s0, s3, 0 +; GFX9-NEXT: s_subb_u32 s3, s2, 0 ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s1, s0, 0 +; GFX9-NEXT: s_subb_u32 s5, s3, 0 ; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: s_cmp_eq_u32 s3, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX9-NEXT: 
v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: s_cmp_eq_u32 s3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -9982,7 +10133,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -10020,8 +10171,8 @@ ; GFX6-NEXT: 
v_mul_hi_u32 v2, s8, v0 ; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 @@ -10038,19 +10189,19 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v4, s13 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 ; GFX6-NEXT: v_mov_b32_e32 v2, s10 @@ -10180,26 +10331,26 @@ ; GFX9-NEXT: s_subb_u32 s2, s6, s9 ; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v2 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s2, s7, s11 -; GFX9-NEXT: s_cmp_ge_u32 s2, s9 -; 
GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: s_subb_u32 s2, s2, 0 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s12 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s11 +; GFX9-NEXT: s_cmp_ge_u32 s0, s9 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX9-NEXT: s_cmp_eq_u32 s2, s9 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cmp_eq_u32 s0, s9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 @@ -10354,7 +10505,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -10391,8 +10542,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 ; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; 
GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -10408,49 +10559,49 @@ ; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: s_add_u32 s4, s14, s2 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s5, s15, s2 -; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: s_ashr_i32 s0, s15, 31 +; GFX6-NEXT: s_add_u32 s2, s14, s0 +; GFX6-NEXT: s_mov_b32 s1, s0 +; GFX6-NEXT: s_addc_u32 s3, s15, s0 +; GFX6-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 -; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 -; GFX6-NEXT: v_rcp_f32_e32 v6, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GFX6-NEXT: v_rcp_f32_e32 v4, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v6 
-; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_sub_u32 s0, 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_mul_hi_u32 v3, s0, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 ; GFX6-NEXT: s_subb_u32 s1, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: s_ashr_i32 s14, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v4, v3 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 @@ -10460,15 +10611,15 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 
v4, s0, v3 ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 @@ -10511,8 +10662,8 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 @@ -10529,19 +10680,19 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 
v2, s14, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, s14 @@ -10672,33 +10823,33 @@ ; GFX9-NEXT: s_subb_u32 s2, s4, s13 ; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s2, s5, s15 -; GFX9-NEXT: s_cmp_ge_u32 s2, s13 -; GFX9-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: s_subb_u32 s2, s2, 0 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s5, s15 +; GFX9-NEXT: s_cmp_ge_u32 s0, s13 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX9-NEXT: s_cmp_eq_u32 s2, s13 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: s_cmp_eq_u32 s0, s13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s0 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: s_add_u32 s2, s10, s0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_addc_u32 s3, s11, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s14, v0 ; 
GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 @@ -10809,26 +10960,26 @@ ; GFX9-NEXT: s_subb_u32 s2, s6, s5 ; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s0, s2, 0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s2, s7, s11 -; GFX9-NEXT: s_cmp_ge_u32 s2, s5 -; GFX9-NEXT: v_mov_b32_e32 v7, s12 -; GFX9-NEXT: v_mov_b32_e32 v8, s0 +; GFX9-NEXT: s_subb_u32 s2, s2, 0 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: s_cselect_b32 s3, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; GFX9-NEXT: s_subb_u32 s0, s7, s11 +; GFX9-NEXT: s_cmp_ge_u32 s0, s5 +; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX9-NEXT: s_cmp_eq_u32 s2, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, s3 +; GFX9-NEXT: s_cmp_eq_u32 s0, s5 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v7, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s10, v5 ; GFX9-NEXT: v_mov_b32_e32 v5, s10 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ 
b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -40,12 +40,16 @@ } ; GCN-LABEL: {{^}}select_and_v4: -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0 +; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0 +; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0 +; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0 +; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] +; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] +; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] +; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] ; GCN-NOT: v_and_b32 -; GCN: store_dword +; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> @@ -94,12 +98,16 @@ } ; GCN-LABEL: {{^}}select_or_v4: -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, -; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1 +; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1 +; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1 +; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1 ; GCN-NOT: v_or_b32 -; GCN: store_dword +; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] +; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] +; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] +; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] +; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @select_or_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) { %c = icmp slt i32 %x, 11 %s = select i1 %c, <4 x i32> zeroinitializer, <4 
x i32> @@ -147,10 +155,15 @@ } ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32: -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9, -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 6, 5, -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 10, 6, -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 14, 7, +; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14 +; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10 +; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6 +; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2 +; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] +; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] +; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] +; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] +; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(<4 x i32> addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, <4 x i32> , <4 x i32> %bo = sub <4 x i32> , %sel @@ -261,14 +274,16 @@ } ; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32: -; GCN-DAG: v_mov_b32_e32 [[T2:v[0-9]+]], 0x40a00000 -; GCN-DAG: v_mov_b32_e32 [[T3:v[0-9]+]], 0x41100000 -; GCN-DAG: v_mov_b32_e32 [[T4:v[0-9]+]], 0x41500000 -; GCN-DAG: v_mov_b32_e32 [[F4:v[0-9]+]], 0x40c00000 -; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, [[T2]], -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[T3]], -; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[F4]], [[T4]], +; GCN: s_mov_b32 [[T0:s[0-9]+]], 0x41500000 +; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000 +; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0 +; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0 +; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0 +; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]] +; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]] +; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]] +; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]] +; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]] define amdgpu_kernel void 
@fsub_constant_sel_constants_v4f32(<4 x float> addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, <4 x float> , <4 x float> %bo = fsub <4 x float> , %sel diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,11 +14,12 @@ ; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s6, s6 ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_cmp_lg_u32 s4, 0 -; GFX7-NEXT: s_addc_u32 s4, s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX7-NEXT: s_addc_u32 s7, s6, 0 +; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX7-NEXT: s_cselect_b32 s4, s7, 0 ; GFX7-NEXT: s_cmp_gt_u32 s6, 31 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -31,11 +32,12 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6 ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s4, s6, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_addc_u32 s7, s6, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 ; GFX9-NEXT: s_cmp_gt_u32 s6, 31 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -51,10 +53,11 @@ ; GFX10-NEXT: s_cmpk_lg_u32 s5, 0x0 ; GFX10-NEXT: s_addc_u32 s5, s4, 0 ; GFX10-NEXT: s_cselect_b32 s6, -1, 0 +; GFX10-NEXT: s_and_b32 s6, s6, exec_lo +; GFX10-NEXT: s_cselect_b32 s5, s5, 0 ; GFX10-NEXT: s_cmp_gt_u32 s4, 31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 
s5, s6 ; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_add_co_select_user: @@ -65,15 +68,15 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0 ; GFX11-NEXT: s_addc_u32 s1, s0, 0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_and_b32 s2, s2, exec_lo +; GFX11-NEXT: s_cselect_b32 s1, s1, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s2 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v0, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = load volatile i32, i32 addrspace(4)* null, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -38,15 +38,15 @@ ; GCN-LABEL: {{^}}double4_extelt: ; GCN-NOT: buffer_ -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 
s{{[0-9]+}}, 0x47ae147b +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 3 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) { entry: @@ -57,18 +57,17 @@ ; GCN-LABEL: {{^}}double5_extelt: ; GCN-NOT: buffer_ -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4 -; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 3 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 4 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40140a3d, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) { entry: @@ -107,10 +106,9 @@ ; GCN-LABEL: {{^}}double2_extelt: ; GCN-NOT: 
buffer_ -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -14,14 +14,12 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt @@ -31,18 +29,15 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: ; GCN-NOT: 
buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: 
v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt @@ -60,14 +59,12 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt @@ -77,18 +74,15 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; 
GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3 +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: store_dwordx2 v[{{[0-9:]+}}] define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x i64> %foo, i32 %elt diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll --- a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -13,21 +13,20 @@ ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; FIXME: We should be using s_addk_i32 here, but the reg allocation hints -; are not always followed. 
-; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]] +; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] ; SI-DAG: s_andn2_b64 ; SI-DAG: cmp_gt_i32 -; SI-DAG: cndmask_b32 -; SI-DAG: cndmask_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 ; SI-DAG: cmp_lt_i32 -; SI-DAG: cndmask_b32 -; SI-DAG: cndmask_b32 -; SI-DAG: v_cmp_gt_f64 -; SI-DAG: v_cmp_lg_f64 -; SI-DAG: v_cndmask_b32 -; SI: v_cndmask_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: v_cmp_gt_f64_e64 [[FCMP:s[[0-9]+:[0-9]+]]] +; SI-DAG: v_cmp_lg_f64_e32 vcc +; SI-DAG: s_and_b64 [[AND1:s[[0-9]+:[0-9]+]]], [[FCMP]], vcc +; SI-DAG: s_and_b64 [[AND1]], [[AND1]], exec +; SI-DAG: s_cselect_b32 s{{[0-9]+}}, 0x3ff00000, 0 ; SI: v_add_f64 ; SI: s_endpgm define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) { diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1002,22 +1002,27 @@ ; SI-NEXT: s_nop 1 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 +; SI-NEXT: v_readfirstlane_b32 s2, v5 +; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014 +; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 ; SI-NEXT: s_mov_b32 s1, 0xfffff ; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 -; SI-NEXT: v_not_b32_e32 v6, v6 +; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 +; SI-NEXT: v_not_b32_e32 v6, s0 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, v7 -; SI-NEXT: v_and_b32_e32 v7, v5, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 -; SI-NEXT: 
v_cndmask_b32_e64 v5, v7, v5, s[0:1] +; SI-NEXT: v_not_b32_e32 v7, s1 +; SI-NEXT: v_and_b32_e32 v5, v5, v7 +; SI-NEXT: s_and_b32 s0, s2, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v7, s2 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1185,21 +1190,21 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: fast_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 
v[6:7], -v[2:3], v[4:5], 1.0 @@ -1209,24 +1214,29 @@ ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 -; SI-NEXT: v_not_b32_e32 v6, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_mov_b32 s5, 0xfffff +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: v_not_b32_e32 v6, s4 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, v7 -; SI-NEXT: v_and_b32_e32 v7, v5, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v8 -; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] +; SI-NEXT: v_not_b32_e32 v7, s5 +; SI-NEXT: v_and_b32_e32 v5, v5, v7 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: fast_frem_f64: @@ -1373,21 +1383,21 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: unsafe_frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s8 -; SI-NEXT: s_mov_b32 s5, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1397,24 +1407,29 @@ ; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] ; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] -; SI-NEXT: v_bfe_u32 v6, v5, 20, 11 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0xfffffc01, v6 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: v_lshr_b64 v[6:7], s[0:1], v8 -; SI-NEXT: v_not_b32_e32 v6, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_mov_b32 s5, 0xfffff +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: v_not_b32_e32 v6, s4 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, v7 -; SI-NEXT: v_and_b32_e32 v7, v5, v7 -; SI-NEXT: v_and_b32_e32 v9, 0x80000000, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v8 -; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: 
v_cmp_lt_i32_e64 s[0:1], 51, v8 -; SI-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] +; SI-NEXT: v_not_b32_e32 v7, s5 +; SI-NEXT: v_and_b32_e32 v5, v5, v7 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] +; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: unsafe_frem_f64: @@ -3125,21 +3140,26 @@ ; SI-NEXT: s_nop 1 ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] -; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014 +; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 -; SI-NEXT: v_not_b32_e32 v10, v10 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9 +; SI-NEXT: v_not_b32_e32 v10, s0 ; SI-NEXT: v_and_b32_e32 v10, v8, v10 -; SI-NEXT: v_not_b32_e32 v11, v11 -; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 -; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 -; SI-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[0:1] +; SI-NEXT: v_not_b32_e32 v11, s1 +; SI-NEXT: v_and_b32_e32 v9, v9, v11 +; SI-NEXT: s_and_b32 s0, s8, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SI-NEXT: 
v_cndmask_b32_e64 v8, v10, v8, s[0:1] +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SI-NEXT: s_cmp_gt_i32 s9, 51 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] @@ -3156,20 +3176,25 @@ ; SI-NEXT: s_nop 1 ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 -; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 -; SI-NEXT: v_not_b32_e32 v8, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014 +; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9 +; SI-NEXT: v_not_b32_e32 v8, s0 ; SI-NEXT: v_and_b32_e32 v8, v6, v8 -; SI-NEXT: v_not_b32_e32 v9, v9 -; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 -; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 -; SI-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[0:1] +; SI-NEXT: v_not_b32_e32 v9, s1 +; SI-NEXT: v_and_b32_e32 v7, v7, v9 +; SI-NEXT: s_and_b32 s0, s8, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] +; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: s_cmp_gt_i32 s9, 51 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: 
s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll --- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -11,7 +11,7 @@ ; FUNC-LABEL: {{^}}v_ftrunc_f64: ; CI: v_trunc_f64 -; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0xb0014 ; SI: s_endpgm define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %x = load double, double addrspace(1)* %in, align 8 @@ -29,11 +29,11 @@ ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]] ; SI-DAG: s_andn2_b64 ; SI-DAG: cmp_gt_i32 -; SI-DAG: cndmask_b32 -; SI-DAG: cndmask_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 ; SI-DAG: cmp_lt_i32 -; SI-DAG: cndmask_b32 -; SI-DAG: cndmask_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 ; SI: s_endpgm define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -25,25 +25,21 @@ ; GFX9-NEXT: s_mul_i32 s7, s3, s6 ; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6 ; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_not_b32 s9, s6 ; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_i32 s9, s4, s9 +; GFX9-NEXT: s_add_i32 s8, s6, 1 ; GFX9-NEXT: s_add_i32 s7, s2, s7 +; GFX9-NEXT: s_add_i32 s9, s2, s9 ; GFX9-NEXT: s_cmp_ge_u32 s7, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_not_b32 s6, s6 -; GFX9-NEXT: s_mul_i32 s6, s4, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: s_add_i32 s6, s2, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: 
v_add_u32_e32 v5, 1, v2 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s8, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s4 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 ; GFX9-NEXT: s_add_u32 s2, s2, 1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -73,23 +69,21 @@ ; GFX10-NEXT: s_mul_i32 s7, s3, s6 ; GFX10-NEXT: s_mul_hi_u32 s6, s2, s6 ; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_not_b32 s8, s6 ; GFX10-NEXT: s_mul_i32 s7, s5, s6 +; GFX10-NEXT: s_mul_i32 s8, s4, s8 ; GFX10-NEXT: s_add_i32 s7, s2, s7 +; GFX10-NEXT: s_add_i32 s9, s6, 1 +; GFX10-NEXT: s_add_i32 s8, s2, s8 ; GFX10-NEXT: s_cmp_ge_u32 s7, s4 -; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX10-NEXT: s_cselect_b32 s6, s9, s6 +; GFX10-NEXT: s_cselect_b32 s7, s8, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 -; GFX10-NEXT: s_not_b32 s9, s6 -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_mul_i32 s8, s4, s9 -; GFX10-NEXT: s_add_i32 s8, s2, s8 +; GFX10-NEXT: s_cmp_ge_u32 s7, s4 +; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_add_u32 s2, s2, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -127,26 +121,22 @@ ; GFX11-NEXT: s_mul_i32 s7, s3, s6 ; GFX11-NEXT: s_mul_hi_u32 s6, s2, s6 ; GFX11-NEXT: s_add_i32 s6, s6, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s8, s6 ; GFX11-NEXT: s_mul_i32 s7, s5, s6 +; GFX11-NEXT: s_mul_i32 s8, s4, s8 ; GFX11-NEXT: s_add_i32 s7, s2, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s9, s6, 1 +; GFX11-NEXT: s_add_i32 s8, s2, s8 ; GFX11-NEXT: s_cmp_ge_u32 s7, s4 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cselect_b32 s6, s9, s6 +; GFX11-NEXT: s_cselect_b32 s7, s8, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 -; GFX11-NEXT: s_not_b32 s9, s6 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: s_mul_i32 s8, s4, s9 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s8, s2, s8 +; GFX11-NEXT: s_cmp_ge_u32 s7, s4 +; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_add_u32 s2, s2, 1 -; GFX11-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: v_dual_cndmask_b32 v3, s7, v3 :: v_dual_add_nc_u32 v4, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 @@ -330,38 +320,39 @@ ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s2 ; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_sub_i32 s5, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: .LBB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3 -; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s5, s6 +; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s6 +; GFX9-NEXT: s_mul_i32 s7, s6, s3 +; GFX9-NEXT: s_sub_i32 s7, s4, s7 +; GFX9-NEXT: s_add_i32 s8, s6, 1 +; GFX9-NEXT: s_sub_i32 s9, s7, s3 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_cselect_b32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s8, s6, 1 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s6, s8, s6 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, s6, s2 ; GFX9-NEXT: s_add_i32 s4, s4, 1 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -393,21 +384,19 @@ ; GFX10-NEXT: s_add_i32 s6, s6, s7 ; GFX10-NEXT: s_mul_hi_u32 s6, s4, s6 ; GFX10-NEXT: s_mul_i32 s7, s6, s3 +; GFX10-NEXT: s_add_i32 s8, s6, 1 ; GFX10-NEXT: s_sub_i32 s7, s4, s7 +; GFX10-NEXT: s_sub_i32 s9, s7, s3 ; GFX10-NEXT: s_cmp_ge_u32 s7, s3 -; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX10-NEXT: s_cselect_b32 
s6, s8, s6 +; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_add_i32 s8, s6, 1 +; GFX10-NEXT: s_cmp_ge_u32 s7, s3 +; GFX10-NEXT: s_cselect_b32 s6, s8, s6 ; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: s_sub_i32 s8, s7, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, s8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v3, s7, v3, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: s_xor_b32 s6, s6, s2 +; GFX10-NEXT: s_sub_i32 s6, s6, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -449,25 +438,21 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_hi_u32 s6, s4, s6 ; GFX11-NEXT: s_mul_i32 s7, s6, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s8, s6, 1 ; GFX11-NEXT: s_sub_i32 s7, s4, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s9, s7, s3 ; GFX11-NEXT: s_cmp_ge_u32 s7, s3 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cselect_b32 s6, s8, s6 +; GFX11-NEXT: s_cselect_b32 s7, s9, s7 ; GFX11-NEXT: s_add_i32 s8, s6, 1 +; GFX11-NEXT: s_cmp_ge_u32 s7, s3 +; GFX11-NEXT: s_cselect_b32 s6, s8, s6 ; GFX11-NEXT: s_add_i32 s4, s4, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: s_sub_i32 s8, s7, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, s8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v3, s7, v3 :: v_dual_add_nc_u32 v4, 1, v2 
-; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX11-NEXT: s_xor_b32 s6, s6, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s6, s6, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 4 ; GFX11-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -15,17 +15,15 @@ ; GFX8V3-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V3-NEXT: v_mov_b32_e32 v0, s3 -; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8V3-NEXT: s_cselect_b32 s0, s0, 0 ; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8V3-NEXT: v_mov_b32_e32 v2, s2 -; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V3-NEXT: s_cselect_b32 s0, s2, 0 +; GFX8V3-NEXT: s_cselect_b32 s1, s1, 0 ; GFX8V3-NEXT: v_mov_b32_e32 v2, s1 -; GFX8V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V3-NEXT: flat_store_dword v[0:1], v4 ; GFX8V3-NEXT: s_waitcnt vmcnt(0) ; GFX8V3-NEXT: v_mov_b32_e32 v0, 2 @@ -40,17 +38,15 @@ ; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 -; 
GFX8V4-NEXT: v_mov_b32_e32 v0, s3 -; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8V4-NEXT: s_cselect_b32 s0, s0, 0 ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8V4-NEXT: v_mov_b32_e32 v2, s2 -; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0 +; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0 ; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 -; GFX8V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V4-NEXT: flat_store_dword v[0:1], v4 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, 2 @@ -65,17 +61,15 @@ ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 -; GFX8V5-NEXT: v_mov_b32_e32 v2, s0 -; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8V5-NEXT: s_cselect_b32 s0, s0, 0 ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GFX8V5-NEXT: v_mov_b32_e32 v2, s3 -; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s2 +; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0 +; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0 ; GFX8V5-NEXT: v_mov_b32_e32 v2, s1 -; GFX8V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX8V5-NEXT: flat_store_dword v[0:1], v4 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, 2 @@ -88,22 +82,20 @@ ; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9V3-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_cselect_b32 s2, s2, 0 ; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9V3-NEXT: v_mov_b32_e32 v2, s0 -; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v1, s2 +; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V3-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 -; GFX9V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V3-NEXT: flat_store_dword v[0:1], v4 ; GFX9V3-NEXT: s_waitcnt vmcnt(0) ; GFX9V3-NEXT: v_mov_b32_e32 v0, 2 @@ -116,22 +108,20 @@ ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_cselect_b32 s2, s2, 0 ; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9V4-NEXT: v_mov_b32_e32 v2, s0 -; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v1, s2 +; GFX9V4-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 -; 
GFX9V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V4-NEXT: flat_store_dword v[0:1], v4 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) ; GFX9V4-NEXT: v_mov_b32_e32 v0, 2 @@ -144,22 +134,20 @@ ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_cselect_b32 s2, s2, 0 ; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX9V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9V5-NEXT: v_mov_b32_e32 v2, s0 -; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v1, s2 +; GFX9V5-NEXT: s_cselect_b32 s0, s0, 0 +; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0 ; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 -; GFX9V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v3, s0 ; GFX9V5-NEXT: flat_store_dword v[0:1], v4 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) ; GFX9V5-NEXT: v_mov_b32_e32 v0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -12,66 +12,32 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s7 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b32 s33, s6 -; CHECK-NEXT: v_mov_b32_e32 v31, v0 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s4, 0 -; CHECK-NEXT: s_cselect_b64 vcc, -1, 0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, snork@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, snork@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: s_getpc_b64 s[8:9] +; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s9 -; CHECK-NEXT: v_mov_b32_e32 v1, s11 -; CHECK-NEXT: v_mov_b32_e32 v2, s8 -; CHECK-NEXT: v_mov_b32_e32 v4, s10 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CHECK-NEXT: s_and_b32 s4, 1, s7 +; CHECK-NEXT: s_cmp_eq_u32 s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v31, v0 +; CHECK-NEXT: s_cselect_b32 s5, s13, s11 +; CHECK-NEXT: s_cselect_b32 s4, s12, s10 +; CHECK-NEXT: s_mov_b32 s12, s6 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readfirstlane_b32 s4, v2 -; CHECK-NEXT: v_readfirstlane_b32 s5, v3 -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3] -; CHECK-NEXT: s_and_saveexec_b64 s[34:35], vcc -; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: s_mov_b32 s12, s33 -; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 -; CHECK-NEXT: ; 
implicit-def: $vgpr31 -; CHECK-NEXT: ; implicit-def: $vgpr1 -; CHECK-NEXT: s_xor_b64 exec, exec, s[34:35] -; CHECK-NEXT: s_cbranch_execnz .LBB0_1 -; CHECK-NEXT: ; %bb.2: ; CHECK-NEXT: s_endpgm -; CHECK: .amdhsa_kernarg_size 0 -; CHECK-NEXT: .amdhsa_user_sgpr_count 6 -; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 -; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 -; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_dynamic_stack 1 -; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 -; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0 bb: %cond = load i1, i1 addrspace(4)* null %tmp = select i1 %cond, void (i8*, i32, i8*)* bitcast (void ()* @wobble to void (i8*, i32, i8*)*), void (i8*, i32, i8*)* bitcast (void ()* @snork to void (i8*, i32, i8*)*) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -586,21 +586,18 @@ ; GCN-NEXT: s_load_dword s2, s[0:1], 0x44 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s6, 0, s6 ; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: 
v_cndmask_b32_e64 v2, v1, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5 +; GCN-NEXT: s_cselect_b32 s4, 0, s4 ; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -617,51 +614,45 @@ ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 ; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 -; GCN-NEXT: v_mov_b32_e32 v4, 0x3ff00000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 -; GCN-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v9, v0, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 +; GCN-NEXT: s_cselect_b32 s8, 0, s8 ; GCN-NEXT: s_cmp_eq_u32 s12, 1 -; GCN-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3 +; GCN-NEXT: s_cselect_b32 s2, 0, s2 ; GCN-NEXT: s_cmp_eq_u32 s12, 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: s_cselect_b32 s8, 0x3ff00000, s1 +; GCN-NEXT: s_cselect_b32 s9, 0, s0 ; GCN-NEXT: s_cmp_eq_u32 s12, 3 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc -; 
GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7 +; GCN-NEXT: s_cselect_b32 s1, 0, s6 ; GCN-NEXT: s_cmp_eq_u32 s12, 2 -; GCN-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 +; GCN-NEXT: s_cselect_b32 s4, 0, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: s_add_u32 s0, s10, 16 -; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v11, s1 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v7, s11 ; GCN-NEXT: s_add_u32 s0, s10, 32 -; GCN-NEXT: v_mov_b32_e32 v4, s10 -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NEXT: s_addc_u32 s1, s11, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[8:9] +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GCN-NEXT: s_endpgm entry: %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -693,16 +693,14 @@ ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_mov_b32 s4, s0 ; 
SI-NEXT: s_cmp_lg_u32 s8, 1 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s0, s3, 5 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; SI-NEXT: s_cselect_b32 s1, s2, 5 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -738,17 +736,14 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s8, 2 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s2, s2, 5 ; SI-NEXT: s_cmp_lg_u32 s8, 1 -; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s1, s1, 5 ; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc +; SI-NEXT: s_cselect_b32 s0, s0, 5 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -779,31 +774,26 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x8 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 -; SI-NEXT: s_load_dword s4, s[4:5], 0x11 -; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 +; SI-NEXT: s_load_dword s9, s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_cmp_eq_u32 s6, 2 -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 1 -; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_eq_u32 s6, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s3, s9, s3 +; SI-NEXT: s_cmp_eq_u32 s8, 2 +; SI-NEXT: s_cselect_b32 s2, s9, s2 +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_cselect_b32 s1, s9, s1 +; SI-NEXT: s_cmp_eq_u32 s8, 0 +; SI-NEXT: s_cselect_b32 s0, s9, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i32: @@ -1229,116 +1219,88 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s11, 24 ; SI-NEXT: s_cmp_lg_u32 s6, 15 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s4, s11, 16 +; SI-NEXT: s_cselect_b32 s4, s4, 5 +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: s_lshr_b32 s5, s11, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 14 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s4, s11, 8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 
16 +; SI-NEXT: s_lshr_b32 s5, s11, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 13 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 12 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_lshr_b32 s4, s10, 24 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_cselect_b32 s7, s11, 5 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshr_b32 s5, s10, 24 ; SI-NEXT: s_cmp_lg_u32 s6, 11 -; SI-NEXT: v_or_b32_e32 v3, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s4, s10, 16 +; SI-NEXT: s_cselect_b32 s5, s5, 5 +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_lshr_b32 s7, s10, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 10 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_cselect_b32 s7, s7, 5 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshr_b32 s7, s10, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 9 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s7, s7, 5 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 8 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: 
v_cndmask_b32_e32 v2, 5, v2, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_lshr_b32 s4, s9, 24 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_cselect_b32 s10, s10, 5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_lshr_b32 s7, s9, 24 ; SI-NEXT: s_cmp_lg_u32 s6, 7 -; SI-NEXT: v_or_b32_e32 v2, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s4, s9, 16 +; SI-NEXT: s_cselect_b32 s7, s7, 5 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshr_b32 s10, s9, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 6 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s4, s9, 8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_cselect_b32 s10, s10, 5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshr_b32 s10, s9, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 5 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s10, s10, 5 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 4 -; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_lshr_b32 s4, s8, 24 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s9, s9, 0xffff 
+; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_lshr_b32 s9, s8, 24 ; SI-NEXT: s_cmp_lg_u32 s6, 3 -; SI-NEXT: v_or_b32_e32 v1, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s4, s8, 16 +; SI-NEXT: s_cselect_b32 s9, s9, 5 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_lshr_b32 s10, s8, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 2 -; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; SI-NEXT: s_lshr_b32 s4, s8, 8 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_cselect_b32 s10, s10, 5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshr_b32 s10, s8, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s10, s10, 5 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_cmp_lg_u32 s6, 0 -; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: s_cselect_b32 s6, s8, 5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s6, s6, s10 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1534,22 +1496,19 @@ ; SI-NEXT: s_load_dword s8, s[4:5], 0x18 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; 
SI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s8, 1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3 +; SI-NEXT: s_cselect_b32 s2, 0, s2 ; SI-NEXT: s_cmp_eq_u32 s8, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1 +; SI-NEXT: s_cselect_b32 s0, 0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1558,22 +1517,19 @@ ; VI-NEXT: s_load_dword s8, s[4:5], 0x60 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s8, 1 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3 +; VI-NEXT: s_cselect_b32 s2, 0, s2 ; VI-NEXT: s_cmp_eq_u32 s8, 0 -; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1 +; VI-NEXT: s_cselect_b32 s0, 0, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x double> %a, double 8.0, i32 %b @@ -1584,47 +1540,43 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s10, s[4:5], 0x8 +; SI-NEXT: s_load_dword s8, s[4:5], 0x8 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s10, 1 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_cmp_eq_u32 s10, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_cselect_b32 s3, 0, s3 +; SI-NEXT: s_cselect_b32 s2, 5, s2 +; SI-NEXT: s_cmp_eq_u32 s8, 0 +; SI-NEXT: s_cselect_b32 s1, 0, s1 +; SI-NEXT: s_cselect_b32 s0, 5, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 +; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s10, s[4:5], 0x20 +; VI-NEXT: s_load_dword s8, s[4:5], 0x20 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s10, 1 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_cmp_eq_u32 s10, 0 -; 
VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[8:9] -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] +; VI-NEXT: s_cmp_eq_u32 s8, 1 +; VI-NEXT: s_cselect_b32 s3, 0, s3 +; VI-NEXT: s_cselect_b32 s2, 5, s2 +; VI-NEXT: s_cmp_eq_u32 s8, 0 +; VI-NEXT: s_cselect_b32 s1, 0, s1 +; VI-NEXT: s_cselect_b32 s0, 5, s0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i64> %a, i64 5, i32 %b @@ -1635,63 +1587,57 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s12, s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s12, 1 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[6:7] -; SI-NEXT: v_mov_b32_e32 v0, s10 -; SI-NEXT: s_cmp_eq_u32 s12, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[6:7] -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[6:7] -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_cmp_eq_u32 s12, 2 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[6:7] -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[6:7] -; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[6:7] -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; 
SI-NEXT: s_cselect_b32 s7, 0, s11 +; SI-NEXT: s_cselect_b32 s10, 5, s10 +; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s9, 0, s9 +; SI-NEXT: s_cselect_b32 s8, 5, s8 +; SI-NEXT: s_cmp_eq_u32 s6, 2 +; SI-NEXT: s_cselect_b32 s5, 0, s5 +; SI-NEXT: s_cselect_b32 s4, 5, s4 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s12, s[4:5], 0x40 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s12, 1 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s10 -; VI-NEXT: s_cmp_eq_u32 s12, 0 -; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[6:7] -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_cmp_eq_u32 s12, 2 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[6:7] -; VI-NEXT: v_mov_b32_e32 v4, s5 -; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[6:7] -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_cmp_eq_u32 s6, 1 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[6:7] -; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; VI-NEXT: s_cselect_b32 s7, 0, s11 +; VI-NEXT: s_cselect_b32 s10, 5, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 0 +; VI-NEXT: s_cselect_b32 s9, 0, s9 +; VI-NEXT: s_cselect_b32 s8, 5, s8 +; VI-NEXT: 
s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cselect_b32 s5, 0, s5 +; VI-NEXT: s_cselect_b32 s4, 5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x i64> %a, i64 5, i32 %b @@ -1704,36 +1650,32 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: v_mov_b32_e32 v4, 0x40200000 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s6, 1 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; SI-NEXT: s_cselect_b32 s5, 0, s10 ; SI-NEXT: s_cmp_eq_u32 s6, 0 -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9 +; SI-NEXT: s_cselect_b32 s8, 0, s8 ; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc -; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15 +; SI-NEXT: s_cselect_b32 s10, 0, s14 ; SI-NEXT: s_cmp_eq_u32 s6, 2 -; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: s_cselect_b32 s6, 0x40200000, s13 +; SI-NEXT: s_cselect_b32 s11, 0, s12 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -1741,36 +1683,32 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s6, 1 -; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cselect_b32 s4, 0x40200000, s11 +; VI-NEXT: s_cselect_b32 s5, 0, s10 ; VI-NEXT: s_cmp_eq_u32 s6, 0 -; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9 +; VI-NEXT: s_cselect_b32 s8, 0, s8 ; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s15 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc -; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15 +; VI-NEXT: s_cselect_b32 s10, 0, s14 ; VI-NEXT: s_cmp_eq_u32 s6, 2 -; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc -; VI-NEXT: v_mov_b32_e32 v5, s13 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: 
s_mov_b32 s2, -1 -; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: s_cselect_b32 s6, 0x40200000, s13 +; VI-NEXT: s_cselect_b32 s11, 0, s12 +; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x double> %a, double 8.0, i32 %b diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -284,10 +284,9 @@ ; SI-NEXT: v_mul_hi_u32 v0, s0, v0 ; SI-NEXT: v_mul_hi_u32 v2, s1, v2 ; SI-NEXT: s_mul_i32 s1, s1, s3 -; SI-NEXT: s_mul_i32 s0, s0, s2 +; SI-NEXT: s_mul_i32 s2, s0, s2 ; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v0 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v5, s0 ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc @@ -297,8 +296,10 @@ ; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, s2 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -322,13 +323,12 @@ ; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_add_i32 s1, s8, s7 ; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_mul_i32 s0, s0, s2 ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_mul_i32 s2, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; 
GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] +; GFX9-NEXT: s_cselect_b32 s1, 0, s1 +; GFX9-NEXT: s_cselect_b32 s0, 0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -354,9 +354,10 @@ ; GFX10-NEXT: s_mul_i32 s0, s0, s2 ; GFX10-NEXT: s_add_i32 s1, s1, s6 ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2 +; GFX10-NEXT: s_cselect_b32 s0, 0, s0 +; GFX10-NEXT: s_cselect_b32 s1, 0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -382,10 +383,10 @@ ; GFX11-NEXT: s_mul_i32 s0, s0, s2 ; GFX11-NEXT: s_add_i32 s1, s1, s6 ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cselect_b32 s0, 0, s0 +; GFX11-NEXT: s_cselect_b32 s1, 0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -413,35 +414,38 @@ ; SI-NEXT: v_mul_hi_u32 v0, s0, v0 ; SI-NEXT: v_mul_hi_i32 v2, s1, v2 ; SI-NEXT: s_mul_i32 s6, s1, s3 -; SI-NEXT: s_cmp_lt_i32 s1, 0 -; SI-NEXT: s_mul_i32 s1, s0, s2 -; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v0 -; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; SI-NEXT: s_mul_i32 s8, s0, s2 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: 
v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v0 +; SI-NEXT: v_readfirstlane_b32 s12, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, s5, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, s2, v1 -; SI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v2, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_lt_i32 s3, 0 -; SI-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; SI-NEXT: s_add_u32 s5, s11, s5 +; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v0 +; SI-NEXT: s_addc_u32 s10, 0, s10 +; SI-NEXT: v_ashrrev_i32_e32 v0, 31, v2 +; SI-NEXT: s_add_u32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: v_subrev_i32_e32 v7, vcc, s0, v6 -; SI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v2, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc -; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc -; SI-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc +; SI-NEXT: s_addc_u32 s4, s10, s9 +; SI-NEXT: s_addc_u32 s5, s12, 0 +; SI-NEXT: s_add_u32 s4, s4, s6 +; SI-NEXT: s_addc_u32 s5, 0, s5 +; SI-NEXT: s_sub_u32 s2, s4, s2 +; SI-NEXT: s_subb_u32 s6, s5, 0 +; SI-NEXT: s_cmp_lt_i32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s6, s5 +; SI-NEXT: s_cselect_b32 s2, s2, s4 +; SI-NEXT: s_sub_u32 s0, s2, s0 +; SI-NEXT: s_subb_u32 s4, s1, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s1, s4, s1 +; SI-NEXT: s_cselect_b32 s0, s0, s2 +; SI-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s0, 0, s8 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -451,44 +455,38 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: s_mul_i32 s7, s0, s3 ; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s6, s0, s3 +; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3 ; GFX9-NEXT: s_add_u32 s9, s8, s7 -; GFX9-NEXT: s_mul_i32 s5, s1, s2 -; GFX9-NEXT: s_addc_u32 s6, 0, s6 +; GFX9-NEXT: s_mul_i32 s6, s1, s2 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX9-NEXT: s_add_u32 s9, s9, s5 +; GFX9-NEXT: s_add_u32 s9, s9, s6 ; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3 -; GFX9-NEXT: s_addc_u32 s4, s6, s4 -; GFX9-NEXT: s_addc_u32 s6, s10, 0 +; GFX9-NEXT: s_addc_u32 s4, s5, s4 +; GFX9-NEXT: s_addc_u32 s5, s10, 0 ; GFX9-NEXT: s_mul_i32 s9, s1, s3 ; GFX9-NEXT: s_add_u32 s4, s4, s9 -; GFX9-NEXT: s_addc_u32 s6, 0, s6 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 ; GFX9-NEXT: s_sub_u32 s9, s4, s2 -; GFX9-NEXT: s_subb_u32 s10, s6, 0 +; GFX9-NEXT: s_subb_u32 s10, s5, 0 ; GFX9-NEXT: s_cmp_lt_i32 s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v2 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_cselect_b32 s1, s10, s5 +; GFX9-NEXT: s_sub_u32 s9, s4, s0 +; GFX9-NEXT: s_subb_u32 s5, s1, 0 ; GFX9-NEXT: s_cmp_lt_i32 s3, 0 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cselect_b32 s5, s5, s1 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 ; GFX9-NEXT: s_add_i32 s1, s8, s7 -; GFX9-NEXT: s_add_i32 s1, s1, s5 -; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: s_mul_i32 s0, s0, s2 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] +; GFX9-NEXT: s_cselect_b32 s1, 0, s1 +; GFX9-NEXT: s_cselect_b32 s0, 0, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -498,40 +496,38 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mul_i32 s7, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX10-NEXT: s_mul_hi_u32 s6, s0, s3 -; GFX10-NEXT: s_mul_i32 s5, s1, s2 +; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX10-NEXT: s_mul_i32 s6, s1, s2 ; GFX10-NEXT: s_add_u32 s11, s8, s7 ; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX10-NEXT: s_addc_u32 s6, 0, s6 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 ; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3 -; GFX10-NEXT: s_add_u32 s11, s11, s5 +; GFX10-NEXT: s_add_u32 s11, s11, s6 ; GFX10-NEXT: s_mul_i32 s10, s1, s3 -; GFX10-NEXT: s_addc_u32 s4, s6, s4 -; GFX10-NEXT: s_addc_u32 s6, s9, 0 +; GFX10-NEXT: s_addc_u32 s4, s5, s4 +; GFX10-NEXT: s_addc_u32 s5, s9, 0 ; GFX10-NEXT: s_add_u32 s4, s4, s10 -; GFX10-NEXT: s_addc_u32 s6, 0, s6 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 ; GFX10-NEXT: s_sub_u32 s9, s4, s2 -; GFX10-NEXT: s_subb_u32 s10, s6, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_subb_u32 s10, s5, 0 ; GFX10-NEXT: s_cmp_lt_i32 s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s10 -; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX10-NEXT: s_cselect_b32 s1, s9, s4 +; GFX10-NEXT: s_cselect_b32 s4, s10, s5 +; GFX10-NEXT: s_sub_u32 s9, s1, s0 +; GFX10-NEXT: s_subb_u32 s5, s4, 0 ; GFX10-NEXT: s_cmp_lt_i32 s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo -; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX10-NEXT: s_add_i32 s1, s8, s7 ; GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: 
s_add_i32 s1, s1, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX10-NEXT: s_ashr_i32 s4, s1, 31 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo +; GFX10-NEXT: s_cselect_b32 s5, s5, s4 +; GFX10-NEXT: s_cselect_b32 s4, s9, s1 +; GFX10-NEXT: s_add_i32 s1, s8, s7 +; GFX10-NEXT: s_add_i32 s1, s1, s6 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] +; GFX10-NEXT: s_cselect_b32 s0, 0, s0 +; GFX10-NEXT: s_cselect_b32 s1, 0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm ; @@ -541,42 +537,40 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s7, s0, s3 ; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX11-NEXT: s_mul_hi_u32 s6, s0, s3 -; GFX11-NEXT: s_mul_i32 s5, s1, s2 +; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3 +; GFX11-NEXT: s_mul_i32 s6, s1, s2 ; GFX11-NEXT: s_add_u32 s11, s8, s7 ; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX11-NEXT: s_addc_u32 s6, 0, s6 +; GFX11-NEXT: s_addc_u32 s5, 0, s5 ; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3 -; GFX11-NEXT: s_add_u32 s11, s11, s5 +; GFX11-NEXT: s_add_u32 s11, s11, s6 ; GFX11-NEXT: s_mul_i32 s10, s1, s3 -; GFX11-NEXT: s_addc_u32 s4, s6, s4 -; GFX11-NEXT: s_addc_u32 s6, s9, 0 +; GFX11-NEXT: s_addc_u32 s4, s5, s4 +; GFX11-NEXT: s_addc_u32 s5, s9, 0 ; GFX11-NEXT: s_add_u32 s4, s4, s10 -; GFX11-NEXT: s_addc_u32 s6, 0, s6 +; GFX11-NEXT: s_addc_u32 s5, 0, s5 ; GFX11-NEXT: s_sub_u32 s9, s4, s2 -; GFX11-NEXT: s_subb_u32 s10, s6, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10 +; GFX11-NEXT: s_subb_u32 s10, s5, 0 ; GFX11-NEXT: s_cmp_lt_i32 s1, 0 -; GFX11-NEXT: s_cselect_b32 
vcc_lo, -1, 0 +; GFX11-NEXT: s_cselect_b32 s1, s9, s4 +; GFX11-NEXT: s_cselect_b32 s4, s10, s5 +; GFX11-NEXT: s_sub_u32 s9, s1, s0 +; GFX11-NEXT: s_subb_u32 s5, s4, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0 -; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_add_i32 s1, s8, s7 ; GFX11-NEXT: s_mul_i32 s0, s0, s2 -; GFX11-NEXT: s_add_i32 s1, s1, s5 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3 -; GFX11-NEXT: s_ashr_i32 s4, s1, 31 +; GFX11-NEXT: s_cselect_b32 s5, s5, s4 +; GFX11-NEXT: s_cselect_b32 s4, s9, s1 +; GFX11-NEXT: s_add_i32 s1, s8, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s1, s1, s6 +; GFX11-NEXT: s_ashr_i32 s6, s1, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s7, s6 +; GFX11-NEXT: s_cmp_lg_u64 s[4:5], s[6:7] +; GFX11-NEXT: s_cselect_b32 s0, 0, s0 +; GFX11-NEXT: s_cselect_b32 s1, 0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s5, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -5,42 +5,37 @@ 
define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { ; SI-LABEL: round_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 -; SI-NEXT: s_mov_b32 s0, s10 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; SI-NEXT: s_andn2_b64 s[2:3], s[6:7], s[0:1] -; SI-NEXT: s_and_b32 s0, s7, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s4, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s4, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_bfe_u32 s0, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s0, 0xfc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s0 +; SI-NEXT: s_andn2_b64 s[8:9], s[2:3], s[8:9] +; SI-NEXT: s_and_b32 s5, s3, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s0, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s5, s5, s9 +; SI-NEXT: s_cmp_gt_i32 s0, 51 +; SI-NEXT: s_cselect_b32 s8, s2, s8 +; SI-NEXT: s_cselect_b32 s9, s3, s5 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: v_mov_b32_e32 v5, s7 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: 
v_bfi_b32 v4, s0, v4, v5 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_f64: @@ -144,66 +139,56 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 -; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s7, 0 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], s3 +; SI-NEXT: s_andn2_b64 s[10:11], s[6:7], s[10:11] +; SI-NEXT: s_and_b32 s12, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cselect_b32 s10, 0, s10 +; SI-NEXT: s_cselect_b32 s11, s12, s11 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: 
s_cselect_b32 s10, s6, s10 +; SI-NEXT: s_cselect_b32 s11, s7, s11 +; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01 -; SI-NEXT: s_brev_b32 s7, -2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 -; SI-NEXT: v_bfi_b32 v4, s7, v6, v4 -; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, 0x80000000 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v2, s3, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] +; SI-NEXT: s_add_i32 s10, s6, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s10 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 s8, s5, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s10, 0 -; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s8, s7 ; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cndmask_b32_e64 v0, 
v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: v_mov_b32_e32 v7, s9 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s7, v6, v7 -; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc -; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v4, s3, v4, v5 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v2f64: @@ -242,116 +227,96 @@ ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s14 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s13, 0xfffff +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18 -; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] -; SI-NEXT: s_and_b32 s0, s7, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s18, 0 -; SI-NEXT: v_mov_b32_e32 v0, s17 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s18, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 
v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01 +; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 +; SI-NEXT: s_add_i32 s16, s12, 0xfffffc01 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], s16 +; SI-NEXT: s_andn2_b64 s[14:15], s[6:7], s[14:15] +; SI-NEXT: s_and_b32 s17, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_cselect_b32 s15, s17, s15 +; SI-NEXT: s_cmp_gt_i32 s16, 51 +; SI-NEXT: s_cselect_b32 s14, s6, s14 +; SI-NEXT: s_cselect_b32 s15, s7, s15 +; SI-NEXT: v_mov_b32_e32 v0, s14 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] ; SI-NEXT: s_brev_b32 s16, -2 -; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 -; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] -; SI-NEXT: s_and_b32 s0, s5, 0x80000000 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_cmp_lt_i32 s17, 0 -; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s17, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v2, s16, v8, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014 +; SI-NEXT: v_add_f64 v[2:3], s[14:15], v[0:1] +; SI-NEXT: s_add_i32 s14, s6, 0xfffffc01 +; 
SI-NEXT: s_lshr_b64 s[6:7], s[12:13], s14 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: s_and_b32 s15, s5, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s15, s7 +; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: s_cselect_b32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: s_add_i32 s14, s4, 0xfffffc01 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s14 +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_and_b32 s15, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s15, s5 +; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: v_bfi_b32 v4, s16, v8, v4 +; SI-NEXT: s_cselect_b32 s4, s10, s4 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] -; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 -; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_add_f64 v[4:5], s[10:11], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v6, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, 0x80000000 -; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 
0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; SI-NEXT: v_bfi_b32 v6, s16, v8, v6 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[12:13], s10 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s4, s8, s4 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 -; SI-NEXT: v_mov_b32_e32 v10, s11 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 -; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 -; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, 0x80000000 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_cmp_lt_i32 s4, 0 -; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s4, 51 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s8 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; SI-NEXT: v_add_f64 v[10:11], s[8:9], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v13, s9 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 -; SI-NEXT: 
v_cndmask_b32_e32 v11, 0, v12, vcc -; SI-NEXT: v_mov_b32_e32 v10, 0 -; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v8, s16, v8, v9 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: @@ -407,219 +372,178 @@ ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 -; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s21, 0xfffff -; SI-NEXT: s_mov_b32 s20, s22 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 -; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 -; SI-NEXT: s_and_b32 s23, s7, 0x80000000 -; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] -; SI-NEXT: s_cmp_lt_i32 s26, 0 -; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_mov_b32_e32 v1, s23 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s26, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: 
v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01 -; SI-NEXT: s_brev_b32 s23, -2 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 -; SI-NEXT: v_bfi_b32 v4, s23, v8, v4 -; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s2, s5, 0x80000000 -; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_bfe_u32 s20, s7, 0xb0014 +; SI-NEXT: s_add_i32 s24, s20, 0xfffffc01 +; SI-NEXT: s_mov_b32 s20, s2 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s24 +; SI-NEXT: s_andn2_b64 s[22:23], s[6:7], s[22:23] +; SI-NEXT: s_and_b32 s25, s7, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s24, 0 -; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cselect_b32 s22, 0, s22 +; SI-NEXT: s_cselect_b32 s23, s25, s23 ; SI-NEXT: s_cmp_gt_i32 s24, 51 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; SI-NEXT: s_cselect_b32 s22, s6, s22 +; SI-NEXT: s_cselect_b32 s23, s7, s23 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_add_f64 v[0:1], s[6:7], -v[0:1] +; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v2, s6, v8, v2 +; SI-NEXT: s_bfe_u32 s7, s5, 0xb0014 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[0:1] +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[22:23], s[4:5], s[22:23] +; SI-NEXT: s_and_b32 s24, s5, 0x80000000 +; SI-NEXT: 
s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s22, 0, s22 +; SI-NEXT: s_cselect_b32 s23, s24, s23 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s22, s4, s22 +; SI-NEXT: s_cselect_b32 s23, s5, s23 +; SI-NEXT: v_mov_b32_e32 v0, s22 +; SI-NEXT: v_mov_b32_e32 v1, s23 +; SI-NEXT: v_add_f64 v[0:1], s[4:5], -v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 +; SI-NEXT: s_bfe_u32 s4, s11, 0xb0014 +; SI-NEXT: v_bfi_b32 v4, s6, v8, v4 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s7 +; SI-NEXT: v_add_f64 v[0:1], s[22:23], v[0:1] +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_and_b32 s22, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s22, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s4, s10, s4 +; SI-NEXT: s_cselect_b32 s5, s11, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] -; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 -; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_add_f64 v[4:5], s[10:11], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v6, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 -; SI-NEXT: v_bfi_b32 v6, s23, v8, v6 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, 0x80000000 +; SI-NEXT: v_bfi_b32 v6, s6, v8, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: s_cselect_b64 
s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] -; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 -; SI-NEXT: v_bfi_b32 v9, s23, v8, v9 -; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, 0x80000000 -; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: v_mov_b32_e32 v4, s5 -; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] +; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[4:5] +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s10, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s4, s8, s4 +; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v9, s8 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] -; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 -; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] -; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 -; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v9, 
s9 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] +; SI-NEXT: s_bfe_u32 s4, s15, 0xb0014 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[4:5] +; SI-NEXT: s_and_b32 s8, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s5, s15, s5 +; SI-NEXT: s_cselect_b32 s4, s14, s4 +; SI-NEXT: v_mov_b32_e32 v10, s5 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: v_add_f64 v[9:10], s[14:15], -v[9:10] +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 -; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, 0x80000000 -; SI-NEXT: v_bfi_b32 v11, s23, v8, v11 -; SI-NEXT: s_cmp_lt_i32 s4, 0 +; SI-NEXT: v_bfi_b32 v11, s6, v8, v11 ; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc ; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s4, 51 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[9:10] -; SI-NEXT: v_mov_b32_e32 v10, s2 -; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[9:10] ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6 -; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5] -; SI-NEXT: s_and_b32 s4, s13, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s25 -; SI-NEXT: s_cmp_lt_i32 s6, 0 -; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s6, 51 -; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 -; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 -; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 -; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], 
s[8:9] -; SI-NEXT: s_and_b32 s8, s19, 0x80000000 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: s_cmp_lt_i32 s10, 0 -; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5] -; SI-NEXT: v_mov_b32_e32 v9, s29 -; SI-NEXT: v_mov_b32_e32 v10, s8 -; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] -; SI-NEXT: v_mov_b32_e32 v10, s19 -; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11] -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9] -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 -; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11] -; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 -; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] -; SI-NEXT: s_and_b32 s8, s17, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 -; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] +; SI-NEXT: s_and_b32 s8, s13, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s5, s13, s5 +; SI-NEXT: s_cselect_b32 s4, s12, s4 +; SI-NEXT: s_bfe_u32 s7, s19, 0xb0014 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s7 +; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s10, s19, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b32 s8, 0, s8 +; SI-NEXT: s_cselect_b32 s9, s10, s9 +; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s9, s19, s9 +; SI-NEXT: s_cselect_b32 s8, s18, s8 +; SI-NEXT: s_bfe_u32 s7, s17, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v13, s5 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: v_mov_b32_e32 v12, s4 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], s7 +; SI-NEXT: v_add_f64 v[12:13], s[12:13], -v[12:13] +; 
SI-NEXT: s_andn2_b64 s[10:11], s[16:17], s[10:11] +; SI-NEXT: s_and_b32 s12, s17, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 +; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: s_cselect_b32 s10, 0, s10 +; SI-NEXT: s_cselect_b32 s11, s12, s11 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_mov_b32_e32 v12, s8 -; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[8:9] -; SI-NEXT: v_mov_b32_e32 v12, s17 -; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v14, v11, v12, s[10:11] -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[8:9] -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[10:11] -; SI-NEXT: v_add_f64 v[11:12], s[16:17], -v[13:14] -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v11, s19 -; SI-NEXT: v_bfi_b32 v20, s23, v8, v11 -; SI-NEXT: v_add_f64 v[11:12], s[18:19], -v[9:10] -; SI-NEXT: v_bfi_b32 v19, s23, v8, v19 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v11, 0 -; SI-NEXT: v_cndmask_b32_e64 v12, 0, v20, s[10:11] -; SI-NEXT: v_add_f64 v[11:12], v[9:10], v[11:12] -; SI-NEXT: v_cndmask_b32_e64 v10, 0, v19, s[8:9] -; SI-NEXT: v_mov_b32_e32 v9, 0 -; SI-NEXT: v_mov_b32_e32 v16, s15 -; SI-NEXT: v_add_f64 v[9:10], v[13:14], v[9:10] -; SI-NEXT: v_mov_b32_e32 v13, s24 -; SI-NEXT: v_cndmask_b32_e64 v14, v15, v16, s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc -; SI-NEXT: v_mov_b32_e32 v15, s14 -; SI-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v15, s15 -; SI-NEXT: v_bfi_b32 v19, s23, v8, v15 -; SI-NEXT: v_mov_b32_e32 v15, s26 -; SI-NEXT: v_mov_b32_e32 v18, s13 -; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v16, s12 -; SI-NEXT: v_cndmask_b32_e64 v18, v17, v18, s[6:7] -; SI-NEXT: v_cndmask_b32_e64 v17, v15, v16, s[6:7] 
-; SI-NEXT: v_mov_b32_e32 v15, s13 -; SI-NEXT: v_bfi_b32 v8, s23, v8, v15 -; SI-NEXT: v_add_f64 v[15:16], s[12:13], -v[17:18] -; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5 -; SI-NEXT: v_add_f64 v[15:16], s[14:15], -v[13:14] -; SI-NEXT: s_mov_b32 s23, 0xf000 -; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5 -; SI-NEXT: v_mov_b32_e32 v15, 0 -; SI-NEXT: v_cndmask_b32_e64 v16, 0, v19, s[0:1] -; SI-NEXT: v_add_f64 v[15:16], v[13:14], v[15:16] -; SI-NEXT: v_cndmask_b32_e32 v14, 0, v8, vcc -; SI-NEXT: v_mov_b32_e32 v13, 0 -; SI-NEXT: v_add_f64 v[13:14], v[17:18], v[13:14] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 +; SI-NEXT: s_cselect_b32 s11, s17, s11 +; SI-NEXT: v_mov_b32_e32 v9, s13 +; SI-NEXT: v_add_f64 v[12:13], s[18:19], -v[12:13] +; SI-NEXT: s_cselect_b32 s10, s16, s10 +; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 +; SI-NEXT: v_mov_b32_e32 v14, s10 +; SI-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5 +; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[14:15] +; SI-NEXT: v_bfi_b32 v9, s6, v8, v9 +; SI-NEXT: v_cndmask_b32_e32 v13, 0, v9, vcc +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 +; SI-NEXT: v_bfi_b32 v8, s6, v8, v9 +; SI-NEXT: v_mov_b32_e32 v12, 0 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v8, vcc +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_mov_b32_e32 v16, 0 +; SI-NEXT: v_add_f64 v[14:15], s[8:9], v[12:13] +; SI-NEXT: v_add_f64 v[12:13], s[10:11], v[8:9] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_add_f64 v[8:9], s[4:5], v[16:17] +; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; SI-NEXT: 
buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -9,8 +9,8 @@ ; GCN: s_load_dwordx2 ; GCN: s_cmp_eq_u32 -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN-NOT: load_dword ; GCN: flat_load_dwordx2 @@ -35,8 +35,8 @@ ; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN: v_cndmask_b32 -; GCN: v_cndmask_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: flat_store_dwordx2 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) { %tmp2 = icmp eq i32 %tmp, 0 diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -74,7 +74,7 @@ ; Check that the select instruction is not deleted. 
; FUNC-LABEL: {{^}}i24_i32_i32_mad: ; EG: CNDE_INT -; SI: v_cndmask +; SI: s_cselect ; GCN2: s_cselect define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -259,7 +259,7 @@ ; GCN-DAG: s_and_b32 ; GCN-DAG: s_sub_i32 ; GCN-DAG: s_lshr_b32 -; GCN: v_add_i32_e32 +; GCN: s_add_i32 define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b %sub0 = sub i8 %a, %b @@ -275,8 +275,8 @@ ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: ; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -294,7 +294,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} +; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -46,11 +46,11 @@ ; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 ; 
GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v4, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc @@ -91,11 +91,11 @@ ; TONGA-NEXT: v_mul_hi_u32 v3, v0, v3 ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v4, v0 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc @@ -118,34 +118,38 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, 0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, v3, 
v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_ashr_i32 s3, s2, 31 +; GFX9-NEXT: s_add_i32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_xor_b32 s3, s1, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s1, 0, s2 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s1, s1, s8 +; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX9-NEXT: s_add_i32 s8, s8, s1 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 +; GFX9-NEXT: s_mul_i32 s8, s1, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s8 +; GFX9-NEXT: s_add_i32 s9, s1, 1 +; GFX9-NEXT: s_sub_i32 s8, s0, s2 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 +; GFX9-NEXT: s_cselect_b32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s1, 1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s2 +; GFX9-NEXT: s_cselect_b32 s0, s8, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -308,7 +312,7 @@ ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_hi_i32 v1, v0, s2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 @@ -331,7 +335,7 @@ ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 @@ -432,22 +436,22 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -458,7 +462,7 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v8, v0 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v9, v1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm 
@@ -505,8 +509,8 @@ ; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 ; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -517,10 +521,10 @@ ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -538,75 +542,83 @@ ; ; GFX9-LABEL: sdiv_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: s_mov_b32 s10, s6 -; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v2 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 -; GFX9-NEXT: 
v_xor_b32_e32 v3, v3, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v3 -; GFX9-NEXT: v_sub_u32_e32 v10, 0, v2 -; GFX9-NEXT: v_sub_u32_e32 v11, 0, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v9 -; GFX9-NEXT: v_mul_lo_u32 v10, v10, v6 -; GFX9-NEXT: v_mul_lo_u32 v11, v11, v7 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v9 -; GFX9-NEXT: v_mul_hi_u32 v10, v6, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX9-NEXT: v_xor_b32_e32 v4, v8, v4 -; GFX9-NEXT: v_xor_b32_e32 v5, v9, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v11 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v7 -; GFX9-NEXT: v_mul_lo_u32 v8, v6, v2 -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v3 -; GFX9-NEXT: v_add_u32_e32 v10, 1, v6 -; GFX9-NEXT: v_add_u32_e32 v11, 1, v7 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v9 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v8, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v9, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v8, 1, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v9, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v1, 
v1, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v5 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s6, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-NEXT: s_ashr_i32 s8, s7, 31 +; GFX9-NEXT: s_add_i32 s7, s7, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: s_xor_b32 s9, s8, s1 +; GFX9-NEXT: s_xor_b32 s1, s7, s8 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s1, s8 +; GFX9-NEXT: s_mul_i32 s8, s7, s6 +; GFX9-NEXT: s_sub_i32 s1, s1, s8 +; GFX9-NEXT: s_add_i32 s10, s7, 1 +; GFX9-NEXT: s_sub_i32 s8, s1, s6 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s7, s10, s7 +; GFX9-NEXT: s_cselect_b32 s1, s8, s1 +; GFX9-NEXT: s_add_i32 s8, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s6 +; GFX9-NEXT: s_cselect_b32 s6, s8, s7 +; GFX9-NEXT: s_ashr_i32 s7, s4, 31 +; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 +; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s7, s8, s7 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s6, s6, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s8 +; GFX9-NEXT: s_sub_i32 s8, 0, s4 +; GFX9-NEXT: v_readfirstlane_b32 s9, v0 +; GFX9-NEXT: s_mul_i32 s8, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s8, s9, s8 +; 
GFX9-NEXT: s_add_i32 s9, s9, s8 +; GFX9-NEXT: s_mul_hi_u32 s8, s5, s9 +; GFX9-NEXT: s_mul_i32 s9, s8, s4 +; GFX9-NEXT: s_sub_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s10, s8, 1 +; GFX9-NEXT: s_sub_i32 s9, s5, s4 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_cselect_b32 s5, s9, s5 +; GFX9-NEXT: s_add_i32 s9, s8, 1 +; GFX9-NEXT: s_cmp_ge_u32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s8 +; GFX9-NEXT: s_xor_b32 s4, s4, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v2i32: @@ -834,7 +846,7 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 @@ -865,19 +877,19 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 ; GCN-NEXT: v_mul_hi_u32 v0, v10, v0 ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v10, v0 ; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v10, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v7 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] ; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, 
v0 @@ -893,7 +905,7 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v9, v12 ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 @@ -909,12 +921,12 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v5, v4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2 ; GCN-NEXT: v_xor_b32_e32 v7, v8, v14 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v4, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v4, v3 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc @@ -965,7 +977,7 @@ ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v9, v8 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 ; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 @@ -996,9 +1008,9 @@ ; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 ; TONGA-NEXT: v_mul_hi_u32 v0, v10, v0 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; TONGA-NEXT: v_mul_hi_u32 v7, v1, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v10 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v0 ; TONGA-NEXT: v_mul_hi_u32 v0, v2, v0 ; TONGA-NEXT: v_mul_lo_u32 v10, v7, v5 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 @@ -1008,7 +1020,7 @@ ; 
TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 -; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v10, v2 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v7 ; TONGA-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] ; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v0 @@ -1040,149 +1052,165 @@ ; TONGA-NEXT: v_mul_lo_u32 v6, v5, v4 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 ; TONGA-NEXT: v_xor_b32_e32 v7, v8, v14 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v4, v3 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v4, v3 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v7, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0xf000 -; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-NEXT: buffer_load_dwordx4 v[4:7], 
off, s[4:7], 0 offset:16 -; GFX9-NEXT: s_mov_b32 s8, s0 -; GFX9-NEXT: s_mov_b32 s9, s1 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_ashr_i32 s4, s1, 31 +; GFX9-NEXT: s_add_i32 s1, s1, s4 +; GFX9-NEXT: s_xor_b32 s6, s1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GFX9-NEXT: v_add_u32_e32 v4, v4, v9 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v11 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2 -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v10 -; GFX9-NEXT: v_add_u32_e32 v6, v6, v13 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v11 -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v12 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v15 -; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10 -; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13 -; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v14 -; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12 -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15 -; GFX9-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GFX9-NEXT: v_xor_b32_e32 v19, v14, v15 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14 -; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GFX9-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GFX9-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 -; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v11, 0, v5 -; GFX9-NEXT: v_sub_u32_e32 v13, 0, v6 -; GFX9-NEXT: v_mul_lo_u32 v11, v11, v10 -; GFX9-NEXT: v_sub_u32_e32 v15, 0, v7 -; GFX9-NEXT: v_mul_lo_u32 v13, v13, v12 -; GFX9-NEXT: v_mul_lo_u32 v15, v15, v14 -; GFX9-NEXT: v_mul_hi_u32 v9, v8, v9 -; GFX9-NEXT: v_mul_hi_u32 v11, v10, v11 -; GFX9-NEXT: v_mul_hi_u32 v13, v12, v13 -; GFX9-NEXT: v_mul_hi_u32 v15, v14, v15 -; GFX9-NEXT: v_add_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v9, v10, v11 -; GFX9-NEXT: v_add_u32_e32 v10, v12, v13 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v9 -; GFX9-NEXT: v_add_u32_e32 v11, v14, v15 -; GFX9-NEXT: v_mul_hi_u32 v10, v2, v10 -; GFX9-NEXT: v_mul_hi_u32 v11, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4 -; GFX9-NEXT: v_mul_lo_u32 v14, v9, v5 -; GFX9-NEXT: v_mul_lo_u32 v15, v10, v6 -; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12 -; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v14 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc -; GFX9-NEXT: v_sub_u32_e32 v13, v0, v4 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 -; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] -; GFX9-NEXT: v_sub_u32_e32 v14, v1, v5 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc -; 
GFX9-NEXT: v_add_u32_e32 v12, 1, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[2:3] -; GFX9-NEXT: v_sub_u32_e32 v15, v2, v6 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 -; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v14, s[0:1] -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; GFX9-NEXT: v_sub_u32_e32 v12, v3, v7 -; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v15, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v16 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v17 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v18 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v19 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v16 -; GFX9-NEXT: v_sub_u32_e32 v1, v1, v17 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v18 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v19 -; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: v_readfirstlane_b32 s8, v4 +; GFX9-NEXT: s_ashr_i32 s9, s8, 31 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s4, s9, s4 +; GFX9-NEXT: s_xor_b32 s8, s8, s9 +; GFX9-NEXT: s_sub_i32 s9, 0, s6 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s7, v1 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-NEXT: v_readfirstlane_b32 s10, v0 +; GFX9-NEXT: s_mul_i32 s9, s9, s10 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s9 +; GFX9-NEXT: s_add_i32 s10, s10, s9 +; GFX9-NEXT: s_mul_hi_u32 s9, s8, s10 +; GFX9-NEXT: 
s_mul_i32 s10, s9, s6 +; GFX9-NEXT: s_sub_i32 s8, s8, s10 +; GFX9-NEXT: s_add_i32 s11, s9, 1 +; GFX9-NEXT: s_sub_i32 s10, s8, s6 +; GFX9-NEXT: s_cmp_ge_u32 s8, s6 +; GFX9-NEXT: s_cselect_b32 s9, s11, s9 +; GFX9-NEXT: s_cselect_b32 s8, s10, s8 +; GFX9-NEXT: s_add_i32 s10, s9, 1 +; GFX9-NEXT: s_cmp_ge_u32 s8, s6 +; GFX9-NEXT: s_cselect_b32 s6, s10, s9 +; GFX9-NEXT: s_ashr_i32 s8, s7, 31 +; GFX9-NEXT: s_add_i32 s7, s7, s8 +; GFX9-NEXT: s_xor_b32 s7, s7, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: v_readfirstlane_b32 s10, v5 +; GFX9-NEXT: s_ashr_i32 s11, s10, 31 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_add_i32 s10, s10, s11 +; GFX9-NEXT: s_xor_b32 s8, s11, s8 +; GFX9-NEXT: s_sub_i32 s4, s6, s4 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s6, s10, s11 +; GFX9-NEXT: s_sub_i32 s10, 0, s7 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: v_readfirstlane_b32 s11, v0 +; GFX9-NEXT: s_mul_i32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s10, s11, s10 +; GFX9-NEXT: s_add_i32 s11, s11, s10 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s11 +; GFX9-NEXT: s_mul_i32 s11, s10, s7 +; GFX9-NEXT: s_sub_i32 s6, s6, s11 +; GFX9-NEXT: s_add_i32 s12, s10, 1 +; GFX9-NEXT: s_sub_i32 s11, s6, s7 +; GFX9-NEXT: s_cmp_ge_u32 s6, s7 +; GFX9-NEXT: s_cselect_b32 s10, s12, s10 +; GFX9-NEXT: s_cselect_b32 s6, s11, s6 +; GFX9-NEXT: s_add_i32 s11, s10, 1 +; GFX9-NEXT: s_cmp_ge_u32 s6, s7 +; GFX9-NEXT: s_cselect_b32 s6, s11, s10 +; GFX9-NEXT: s_ashr_i32 s7, s9, 31 +; GFX9-NEXT: s_add_i32 s9, s9, s7 +; GFX9-NEXT: s_xor_b32 s9, s9, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX9-NEXT: v_readfirstlane_b32 s11, v6 +; GFX9-NEXT: s_ashr_i32 s12, s11, 31 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_add_i32 s11, s11, s12 +; GFX9-NEXT: s_xor_b32 s7, s12, s7 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; 
GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s8, s11, s12 +; GFX9-NEXT: s_sub_i32 s11, 0, s9 +; GFX9-NEXT: v_readfirstlane_b32 s10, v7 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s11, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s11 +; GFX9-NEXT: s_add_i32 s12, s12, s11 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s12 +; GFX9-NEXT: s_mul_i32 s12, s11, s9 +; GFX9-NEXT: s_sub_i32 s8, s8, s12 +; GFX9-NEXT: s_add_i32 s13, s11, 1 +; GFX9-NEXT: s_sub_i32 s12, s8, s9 +; GFX9-NEXT: s_cmp_ge_u32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s11, s13, s11 +; GFX9-NEXT: s_cselect_b32 s8, s12, s8 +; GFX9-NEXT: s_add_i32 s12, s11, 1 +; GFX9-NEXT: s_cmp_ge_u32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s8, s12, s11 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_ashr_i32 s4, s10, 31 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: s_xor_b32 s6, s8, s7 +; GFX9-NEXT: s_xor_b32 s8, s4, s9 +; GFX9-NEXT: s_sub_i32 s6, s6, s7 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: s_sub_i32 s7, 0, s5 +; GFX9-NEXT: s_add_i32 s10, s10, s4 +; GFX9-NEXT: s_xor_b32 s4, s10, s4 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_mul_i32 s7, s7, s9 +; GFX9-NEXT: s_mul_hi_u32 s7, s9, s7 +; GFX9-NEXT: s_add_i32 s9, s9, s7 +; GFX9-NEXT: s_mul_hi_u32 s7, s4, s9 +; GFX9-NEXT: s_mul_i32 s9, s7, s5 +; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_add_i32 s10, s7, 1 +; GFX9-NEXT: s_sub_i32 s9, s4, s5 +; GFX9-NEXT: s_cmp_ge_u32 s4, s5 +; GFX9-NEXT: s_cselect_b32 s7, s10, s7 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_add_i32 s9, s7, 1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s5 +; GFX9-NEXT: s_cselect_b32 s4, s9, s7 +; GFX9-NEXT: s_xor_b32 s4, s4, s8 +; GFX9-NEXT: s_sub_i32 s4, s4, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: sdiv_v4i32: @@ -1824,7 +1852,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2001,11 +2029,11 @@ ; GCN-NEXT: v_mul_hi_u32 v3, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -2049,11 +2077,11 @@ ; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3 ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v2 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v2, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 +; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc @@ -2077,37 +2105,41 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; 
GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_i32 v2, v1, 0, 25 -; GFX9-NEXT: v_bfe_i32 v1, v1, 24, 1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 -; GFX9-NEXT: v_bfe_i32 v5, v0, 0, 25 -; GFX9-NEXT: v_bfe_i32 v0, v0, 24, 1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v0 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v1, 1, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, v5, v4 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 25 +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: s_bfe_i32 s3, s2, 0x190000 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10018 +; GFX9-NEXT: s_add_i32 s3, s3, s2 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0x190000 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10018 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v1 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_xor_b32 s2, s0, s2 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s1, 0, s3 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s1, s1, s8 +; 
GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 +; GFX9-NEXT: s_add_i32 s8, s8, s1 +; GFX9-NEXT: s_mul_hi_u32 s1, s0, s8 +; GFX9-NEXT: s_mul_i32 s8, s1, s3 +; GFX9-NEXT: s_sub_i32 s0, s0, s8 +; GFX9-NEXT: s_add_i32 s9, s1, 1 +; GFX9-NEXT: s_sub_i32 s8, s0, s3 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 +; GFX9-NEXT: s_cselect_b32 s1, s9, s1 +; GFX9-NEXT: s_cselect_b32 s0, s8, s0 +; GFX9-NEXT: s_add_i32 s8, s1, 1 +; GFX9-NEXT: s_cmp_ge_u32 s0, s3 +; GFX9-NEXT: s_cselect_b32 s0, s8, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_sub_i32 s0, s0, s2 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190000 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -2252,10 +2284,10 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3 ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -59,9 +59,9 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -111,12 +111,13 @@ ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, 
v6, v5, s[0:1] -; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 @@ -126,10 +127,9 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s1 @@ -142,6 +142,7 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 @@ -153,37 +154,39 @@ ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_min_u32 s14, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 
s8, s12 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 +; GCN-IR-NEXT: s_min_u32 s18, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s16, s14, s18 +; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[16:17], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[10:11], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[14:15] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s14, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s12 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s13 -; GCN-IR-NEXT: s_min_u32 s18, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s14, s18 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] -; GCN-IR-NEXT: s_or_b64 s[20:21], s[16:17], s[22:23] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s10 +; GCN-IR-NEXT: s_add_u32 s20, s16, 1 +; GCN-IR-NEXT: s_addc_u32 s21, s17, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[20:21], 0 +; GCN-IR-NEXT: s_sub_i32 s16, 63, s16 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], s16 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; 
GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s20 ; GCN-IR-NEXT: s_add_u32 s19, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] @@ -214,24 +217,16 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_branch .LBB0_6 -; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] -; GCN-IR-NEXT: .LBB0_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] +; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[10:11], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %result = sdiv i64 %x, %y @@ -485,15 +480,15 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -515,15 +510,15 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -591,16 +586,16 @@ ; GCN-NEXT: s_xor_b32 s0, s3, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, 
v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -618,16 +613,16 @@ ; GCN-IR-NEXT: s_xor_b32 s0, s3, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -655,15 +650,15 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -685,15 +680,15 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; 
GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -722,15 +717,15 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -752,15 +747,15 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 
v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -789,15 +784,15 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -819,15 +814,15 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -853,35 +848,35 @@ ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-NEXT: s_xor_b32 s4, s4, s8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: s_or_b32 s7, s4, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-NEXT: s_xor_b32 s4, s6, s10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: s_or_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: s_or_b32 s6, s4, 1 +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GCN-NEXT: v_bfe_i32 v2, v2, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; 
GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -899,35 +894,35 @@ ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-IR-NEXT: s_xor_b32 s4, s4, s8 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s4 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[10:11], 40 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GCN-IR-NEXT: s_or_b32 s7, s4, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s7, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-IR-NEXT: s_xor_b32 s4, s6, s10 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: s_or_b32 s4, s4, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, s4 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: s_or_b32 s6, s4, 1 +; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-IR-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GCN-IR-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v4 
; GCN-IR-NEXT: v_bfe_i32 v2, v2, 0, 24 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm @@ -966,7 +961,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -977,6 +972,7 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 ; GCN-IR-NEXT: s_sext_i32_i16 s1, s1 @@ -996,26 +992,28 @@ ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[14:15] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s14, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s12 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s13 -; GCN-IR-NEXT: s_min_u32 s18, s10, s11 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_min_u32 s14, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 +; GCN-IR-NEXT: s_min_u32 s18, 
s8, s9 ; GCN-IR-NEXT: s_sub_u32 s10, s14, s18 ; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[10:11], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] -; GCN-IR-NEXT: s_or_b64 s[20:21], s[16:17], s[22:23] +; GCN-IR-NEXT: s_or_b64 s[20:21], s[16:17], s[20:21] +; GCN-IR-NEXT: s_and_b64 s[16:17], s[20:21], exec +; GCN-IR-NEXT: s_cselect_b32 s17, 0, s13 +; GCN-IR-NEXT: s_cselect_b32 s16, 0, s12 +; GCN-IR-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[20:21] -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s10, 1 @@ -1057,25 +1055,18 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_branch .LBB9_6 -; GCN-IR-NEXT: .LBB9_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] -; GCN-IR-NEXT: .LBB9_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[6:7] +; GCN-IR-NEXT: .LBB9_5: ; %udiv-end ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[16:17], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 +; GCN-IR-NEXT: buffer_store_short v0, 
off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: s_waitcnt expcnt(0) +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 @@ -1176,23 +1167,23 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 2, v0 +; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 1, v0 +; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1] ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s8 @@ -1204,36 +1195,39 @@ ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s4 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s4 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: 
s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_min_u32 s10, s8, s9 -; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_add_u32 s12, s10, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s13, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec +; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s8 +; GCN-IR-NEXT: s_add_u32 s14, s12, 1 +; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 +; GCN-IR-NEXT: s_sub_i32 s11, 63, s12 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s11 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s12 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s14 ; GCN-IR-NEXT: s_add_u32 s16, 
s2, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 @@ -1263,21 +1257,15 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 ; GCN-IR-NEXT: .LBB10_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: s_branch .LBB10_6 -; GCN-IR-NEXT: .LBB10_5: -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[12:13] -; GCN-IR-NEXT: .LBB10_6: ; %udiv-end -; GCN-IR-NEXT: v_xor_b32_e32 v0, s4, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s5, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB10_5: ; %udiv-end +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 +; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = sdiv i64 24, %x @@ -1788,16 +1776,16 @@ ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s2, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 
v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1815,16 +1803,16 @@ ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1847,17 +1835,17 @@ ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s2, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1873,17 +1861,17 @@ ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, 
s2, 30 -; GCN-IR-NEXT: s_or_b32 s0, s0, 1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s8 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -9,23 +9,24 @@ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s0, 1, s2 -; GCN-NEXT: s_ff1_i32_b32 s0, s0 +; GCN-NEXT: s_lshr_b32 s4, 1, s2 ; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[2:3] -; GCN-NEXT: v_ffbh_i32_e32 v1, v0 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 31, v1 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -1, s[0:1] -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_ff1_i32_b32 s2, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s2, -1, s2 +; GCN-NEXT: s_flbit_i32 s6, s2 +; GCN-NEXT: s_sub_i32 s8, 31, s6 +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s4, -1, s8 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %v = load i32, i32 addrspace(1)* %arrayidx, align 4 %sr = lshr i32 1, %v diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -9,10 +9,11 @@ ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0 ; GCN-DAG: s_cmp_lg_u32 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0 -; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]] -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]] +; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec +; GCN: s_cselect_b32 [[RESULT:s[0-9]+]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c @@ -25,10 +26,11 @@ ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32: ; GCN-DAG: v_cmp_lg_f32_e32 vcc ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_and_b64 vcc, vcc, [[CMP1]] -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: s_and_b64 [[CMP1]], vcc, [[CMP1]] +; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], exec +; GCN: s_cselect_b32 [[RESULT:s[0-9]+]] +; GCN: 
v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c @@ -43,10 +45,13 @@ ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0 ; GCN-DAG: s_cmp_lg_u32 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0 -; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]] -; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] +; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]] +; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec +; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]] +; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] +; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c @@ -59,10 +64,13 @@ ; GCN-LABEL: {{^}}opt_select_i64_and_cmp_f32: ; GCN-DAG: v_cmp_lg_f32_e32 vcc, ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_and_b64 vcc, vcc, [[CMP1]] -; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] +; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]] +; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec +; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]] +; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] 
+; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c @@ -77,10 +85,11 @@ ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0 ; GCN-DAG: s_cmp_lg_u32 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0 -; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]] -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]] +; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec +; GCN-DAG: s_cselect_b32 [[RESULT:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; GCN: buffer_store_dword [[VRESULT]] ; GCN: s_endpgm define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b @@ -94,10 +103,11 @@ ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32: ; GCN-DAG: v_cmp_lg_f32_e32 vcc ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 vcc, vcc, [[CMP1]] -; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN-NOT: [[RESULT]] -; GCN: buffer_store_dword [[RESULT]] +; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]] +; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec +; GCN-DAG: s_cselect_b32 [[RESULT:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c @@ -112,10 +122,13 @@ ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0 ; GCN-DAG: s_cmp_lg_u32 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0 -; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]] -; GCN: 
v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] +; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]] +; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec +; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]] +; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] +; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c @@ -128,10 +141,13 @@ ; GCN-LABEL: {{^}}opt_select_i64_or_cmp_f32: ; GCN-DAG: v_cmp_lg_f32_e32 vcc, ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 vcc, vcc, [[CMP1]] -; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc -; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]] +; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]] +; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec +; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]] +; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]] +; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] +; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -69,7 +69,7 @@ ; GFX89: s_cselect_b32 ; 
GFX89-NOT: s_cselect_b32 -; SI: v_cndmask_b32 +; SI: s_cselect_b32 ; SI-NOT: cndmask define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 { %cmp = icmp eq i8 %c, 0 @@ -83,7 +83,7 @@ ; GFX89: s_cselect_b32 ; GFX89-NOT: s_cselect_b32 -; SI: v_cndmask_b32_e32 +; SI: s_cselect_b32 ; SI-NOT: v_cndmask_b32e define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 @@ -111,10 +111,10 @@ ; SI: cndmask ; SI-NOT: cndmask -; GFX89: v_cndmask_b32_e32 -; GFX89: cndmask -; VI: cndmask -; GFX89-NOT: cndmask +; VI: s_cselect_b32 +; VI: s_cselect_b32 +; GFX9: cndmask +; GFX9: cndmask define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr @@ -156,8 +156,8 @@ ; vector select with SGPR inputs. 
; GCN-LABEL: {{^}}s_select_v2i32: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 @@ -167,10 +167,10 @@ } ; GCN-LABEL: {{^}}s_select_v4i32: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 @@ -198,14 +198,14 @@ } ; GCN-LABEL: {{^}}select_v8i32: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b @@ -214,15 +214,9 @@ } ; GCN-LABEL: {{^}}s_select_v2f32: -; GCN-DAG: s_load_dwordx4 s[[[ALO:[0-9]+]]:[[BHI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} - -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} - -; GCN-DAG: v_cndmask_b32_e32 -; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN-DAG: v_cndmask_b32_e32 +; GCN-DAG: s_cselect_b32 +; GCN-DAG: s_cselect_b32 ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 @@ -234,9 +228,9 @@ ; 
GCN-LABEL: {{^}}s_select_v3f32: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: buffer_store_dwordx define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 { @@ -250,10 +244,10 @@ ; GCN: s_load_dwordx8 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 { @@ -284,11 +278,11 @@ ; GCN-LABEL: {{^}}s_select_v5f32: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 ; GCN: buffer_store_dwordx define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 { @@ -315,10 +309,10 @@ } ; GCN-LABEL: {{^}}select_v2f64: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b @@ -327,14 +321,14 @@ } ; GCN-LABEL: {{^}}select_v4f64: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: 
s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b @@ -343,22 +337,22 @@ } ; GCN-LABEL: {{^}}select_v8f64: -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 -; GCN: v_cndmask_b32_e32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll --- a/llvm/test/CodeGen/AMDGPU/select64.ll +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -1,12 +1,12 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s ; 
GCN-LABEL: {{^}}select0: ; i64 select should be split into two i32 selects, and we shouldn't need ; to use a shfit to extract the hi dword of the input. ; GCN-NOT: s_lshr_b64 -; GCN: v_cndmask -; GCN: v_cndmask +; GCN: s_cselect_b32 +; GCN: s_cselect_b32 define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { entry: %0 = icmp ugt i32 %cond, 5 @@ -16,10 +16,8 @@ } ; GCN-LABEL: {{^}}select_trunc_i64: -; VI: s_cselect_b32 -; VI-NOT: s_cselect_b32 -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 +; GCN: s_cselect_b32 +; GCN-NOT: s_cselect_b32 define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 0, i64 %in @@ -29,10 +27,8 @@ } ; GCN-LABEL: {{^}}select_trunc_i64_2: -; VI: s_cselect_b32 -; VI-NOT: s_cselect_b32 -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 +; GCN: s_cselect_b32 +; GCN-NOT: s_cselect_b32 define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 %a, i64 %b @@ -42,10 +38,8 @@ } ; GCN-LABEL: {{^}}v_select_trunc_i64_2: -; VI: s_cselect_b32 -; VI-NOT: s_cselect_b32 -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 +; GCN: s_cselect_b32 +; GCN-NOT: s_cselect_b32 define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 @@ -57,8 +51,8 @@ } ; GCN-LABEL: {{^}}v_select_i64_split_imm: -; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} -; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} +; GCN-DAG: s_cselect_b32 +; GCN-DAG: s_cselect_b32 ; GCN: s_endpgm define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 diff --git 
a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll --- a/llvm/test/CodeGen/AMDGPU/selectcc.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll @@ -10,8 +10,7 @@ ; EG: CNDE_INT ; SI: v_cmp_eq_u64 ; VI: s_cmp_eq_u64 -; GCN: v_cndmask -; GCN: v_cndmask +; GCN: s_cselect_b32 define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { entry: %0 = icmp eq i64 %lhs, %rhs diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll --- a/llvm/test/CodeGen/AMDGPU/setcc64.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll @@ -261,8 +261,8 @@ } ; GCN-LABEL: {{^}}i128_sle: -; GCN: v_cmp_le_u64 ; GCN: v_cmp_le_i64 +; CGV: v_cndmask ; SI: v_cmp_eq_u64 ; VI: s_cmp_eq_u64 define amdgpu_kernel void @i128_sle(i32 addrspace(1)* %out, i128 %a, i128 %b) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -183,35 +183,29 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s9, 64, s8 -; GCN-NEXT: s_sub_i32 s2, s8, 64 -; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s8 -; GCN-NEXT: s_lshr_b64 s[10:11], s[4:5], s9 -; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 -; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] -; GCN-NEXT: s_cmp_lt_u32 s8, 64 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 
v1, s10 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], s8 -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: s_sub_i32 s5, s4, 64 +; GCN-NEXT: s_sub_i32 s12, 64, s4 +; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s4 +; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s4 +; GCN-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 +; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_cselect_b32 s0, s0, s10 +; GCN-NEXT: s_cselect_b32 s1, s1, s11 +; GCN-NEXT: s_cselect_b32 s5, s9, 0 +; GCN-NEXT: s_cselect_b32 s6, s8, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s1, s3, s1 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = shl i128 %lhs, %rhs @@ -222,35 +216,29 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s9, 64, s8 -; GCN-NEXT: s_sub_i32 s2, s8, 64 -; GCN-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 -; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 -; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 -; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] -; GCN-NEXT: s_cmp_lt_u32 s8, 64 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: 
s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-NEXT: s_sub_i32 s5, s4, 64 +; GCN-NEXT: s_sub_i32 s12, 64, s4 +; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GCN-NEXT: s_lshr_b64 s[8:9], s[2:3], s4 +; GCN-NEXT: s_lshr_b64 s[10:11], s[2:3], s5 +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s12 +; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cselect_b32 s3, s3, s11 +; GCN-NEXT: s_cselect_b32 s5, s9, 0 +; GCN-NEXT: s_cselect_b32 s6, s8, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s3 +; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = lshr i128 %lhs, %rhs @@ -261,37 +249,30 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8 -; GCN-NEXT: s_cmp_lt_u32 s8, 64 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: s_sub_i32 s0, s8, 64 -; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], s0 -; GCN-NEXT: s_sub_i32 s0, 64, s8 -; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s0 -; GCN-NEXT: s_lshr_b64 s[6:7], 
s[4:5], s8 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s5, 64, s4 +; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GCN-NEXT: s_sub_i32 s10, s4, 64 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 +; GCN-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], s10 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 64 +; GCN-NEXT: s_cselect_b32 s3, s3, s12 +; GCN-NEXT: s_cselect_b32 s2, s2, s12 +; GCN-NEXT: s_cselect_b32 s5, s6, s10 +; GCN-NEXT: s_cselect_b32 s6, s7, s11 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s6 +; GCN-NEXT: s_cselect_b32 s0, s0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = ashr i128 %lhs, %rhs @@ -451,66 +432,69 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v6, 16 +; 
GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s12 -; GCN-NEXT: s_sub_i32 s20, s12, 64 -; GCN-NEXT: s_lshr_b64 s[22:23], s[4:5], s22 -; GCN-NEXT: s_lshl_b64 s[24:25], s[6:7], s12 -; GCN-NEXT: s_lshl_b64 s[20:21], s[4:5], s20 -; GCN-NEXT: s_or_b64 s[22:23], s[24:25], s[22:23] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s13, 64, s16 -; GCN-NEXT: s_sub_i32 s6, s16, 64 -; GCN-NEXT: s_lshr_b64 s[14:15], s[8:9], s13 -; GCN-NEXT: s_lshl_b64 s[20:21], s[10:11], s16 -; GCN-NEXT: s_lshl_b64 s[6:7], s[8:9], s6 -; GCN-NEXT: s_or_b64 s[14:15], s[20:21], s[14:15] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[16:17], s[18:19] +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s22, 64, s8 +; GCN-NEXT: s_sub_i32 s20, s8, 64 +; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s8 +; GCN-NEXT: s_lshl_b64 s[20:21], s[0:1], s20 +; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s19, s19, s21 +; 
GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec +; GCN-NEXT: s_cselect_b32 s9, s3, s19 +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s3, s18, s20 +; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 +; GCN-NEXT: s_cselect_b32 s22, s2, s3 +; GCN-NEXT: s_and_b64 s[2:3], s[18:19], s[10:11] +; GCN-NEXT: s_sub_i32 s18, 64, s12 +; GCN-NEXT: s_sub_i32 s10, s12, 64 +; GCN-NEXT: s_lshr_b64 s[18:19], s[4:5], s18 +; GCN-NEXT: s_lshl_b64 s[20:21], s[6:7], s12 +; GCN-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] +; GCN-NEXT: s_and_b64 s[20:21], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s11, s19, s11 +; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 +; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s13, s7, s11 +; GCN-NEXT: s_and_b64 s[20:21], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s7, s18, s10 +; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s10, s6, s7 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; GCN-NEXT: s_and_b64 s[6:7], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s6, s1, 0 +; GCN-NEXT: s_cselect_b32 s7, s0, 0 +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], s12 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: v_cndmask_b32_e64 v7, v0, 
v1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] -; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s12 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = shl <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null @@ -520,66 +504,69 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v6, 16 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s12 -; GCN-NEXT: s_sub_i32 s20, s12, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[6:7], s22 -; GCN-NEXT: s_lshr_b64 s[24:25], s[4:5], s12 -; GCN-NEXT: s_lshr_b64 s[20:21], s[6:7], s20 -; GCN-NEXT: s_or_b64 s[22:23], s[24:25], s[22:23] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] -; 
GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s13, 64, s16 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[14:15], s[10:11], s13 -; GCN-NEXT: s_lshr_b64 s[20:21], s[8:9], s16 -; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 -; GCN-NEXT: s_or_b64 s[14:15], s[20:21], s[14:15] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[16:17], s[18:19] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s22, 64, s8 +; GCN-NEXT: s_sub_i32 s20, s8, 64 +; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 +; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_lshr_b64 s[20:21], s[2:3], s20 +; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s19, s19, s21 +; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: s_and_b64 s[22:23], 
s[10:11], exec +; GCN-NEXT: s_cselect_b32 s9, s1, s19 +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s1, s18, s20 +; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 +; GCN-NEXT: s_cselect_b32 s22, s0, s1 +; GCN-NEXT: s_and_b64 s[0:1], s[18:19], s[10:11] +; GCN-NEXT: s_sub_i32 s18, 64, s12 +; GCN-NEXT: s_sub_i32 s10, s12, 64 +; GCN-NEXT: s_lshl_b64 s[18:19], s[6:7], s18 +; GCN-NEXT: s_lshr_b64 s[20:21], s[4:5], s12 +; GCN-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 +; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] +; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s11, s19, s11 +; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 +; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s13, s5, s11 +; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s5, s18, s10 +; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s10, s4, s5 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 +; GCN-NEXT: s_and_b64 s[4:5], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s4, s3, 0 +; GCN-NEXT: s_cselect_b32 s5, s2, 0 ; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s12 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 -; GCN-NEXT: v_mov_b32_e32 v6, s3 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-NEXT: s_cselect_b32 s1, s2, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; 
GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = lshr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null @@ -589,70 +576,71 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v6, 16 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s12 -; GCN-NEXT: s_sub_i32 s20, s12, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[6:7], s22 -; GCN-NEXT: s_lshr_b64 s[24:25], s[4:5], s12 -; GCN-NEXT: s_ashr_i64 s[20:21], s[6:7], s20 -; GCN-NEXT: s_or_b64 s[22:23], s[24:25], s[22:23] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s13, 64, s16 -; GCN-NEXT: 
s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[14:15], s[10:11], s13 -; GCN-NEXT: s_lshr_b64 s[20:21], s[8:9], s16 -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 -; GCN-NEXT: s_or_b64 s[14:15], s[20:21], s[14:15] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[16:17], s[18:19] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s14 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_ashr_i32 s4, s7, 31 +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_sub_i32 s22, 64, s8 +; GCN-NEXT: s_sub_i32 s20, s8, 64 +; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 +; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_ashr_i64 s[20:21], s[2:3], s20 +; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s19, s19, s21 +; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec +; GCN-NEXT: s_cselect_b32 s9, s1, s19 +; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s1, s18, s20 +; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 +; GCN-NEXT: s_cselect_b32 s22, s0, s1 +; GCN-NEXT: s_and_b64 s[0:1], s[18:19], s[10:11] +; GCN-NEXT: s_sub_i32 s18, 64, s12 +; GCN-NEXT: s_sub_i32 s10, s12, 64 +; GCN-NEXT: s_lshl_b64 s[18:19], s[6:7], s18 +; GCN-NEXT: s_lshr_b64 s[20:21], s[4:5], s12 +; GCN-NEXT: 
s_ashr_i64 s[10:11], s[6:7], s10 +; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] +; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s11, s19, s11 +; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 +; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s13, s5, s11 +; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s5, s18, s10 +; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec +; GCN-NEXT: s_cselect_b32 s10, s4, s5 +; GCN-NEXT: s_ashr_i32 s11, s3, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s8 +; GCN-NEXT: s_and_b64 s[4:5], s[16:17], exec +; GCN-NEXT: s_cselect_b32 s4, s3, s11 +; GCN-NEXT: s_cselect_b32 s5, s2, s11 +; GCN-NEXT: s_ashr_i32 s8, s7, 31 ; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], s12 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: s_ashr_i32 s4, s11, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NEXT: v_mov_b32_e32 v12, s2 -; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[0:1] -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s3, s8 +; GCN-NEXT: s_cselect_b32 s1, s2, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = ashr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> 
addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -15,19 +15,12 @@ ; uses an SGPR (implicit vcc). ; GCN-LABEL: {{^}}sint_to_fp_i1_f64: -; VI-DAG: s_cmp_eq_u32 -; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0 -; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] -; VI: s_endpgm - -; CI-DAG: s_cmp_eq_u32 -; CI-DAG: s_cselect_b64 vcc, -1, 0 -; CI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc -; CI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; CI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] -; CI: s_endpgm +; GCN-DAG: s_cmp_eq_u32 +; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]] +; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] +; GCN: s_endpgm define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -51,7 +51,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -104,19 +104,19 @@ ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, 
s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s11 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -124,38 +124,41 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 -; GCN-IR-NEXT: s_add_i32 s14, s12, 32 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 -; GCN-IR-NEXT: s_min_u32 s10, s14, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s8, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[18:19] +; 
GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_min_u32 s14, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] @@ -186,30 +189,24 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: 
s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_branch .LBB0_6 -; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] -; GCN-IR-NEXT: .LBB0_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s12, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mov_b32 s9, s1 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: s_mov_b32 s13, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out @@ -451,66 +448,72 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: 
s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: s_xor_b32 s3, s2, s4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: s_or_b32 s5, s5, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_add_i32 s5, s6, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x170000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword 
s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-IR-NEXT: s_or_b32 s3, s3, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_add_i32 s5, s6, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x170000 +; 
GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 41 %2 = ashr i64 %y, 41 @@ -522,66 +525,72 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: s_xor_b32 s3, s2, s4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: s_or_b32 s5, s5, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; 
GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_add_i32 s5, s6, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-IR-NEXT: s_or_b32 s3, s3, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; 
GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_add_i32 s5, s6, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 @@ -647,66 +656,72 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 39 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: s_xor_b32 s3, s2, s4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: s_or_b32 s5, s5, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; 
GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_add_i32 s5, s6, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 39 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 39 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-IR-NEXT: s_or_b32 s3, s3, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, 
s3 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_add_i32 s5, s6, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x190000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 39 %2 = ashr i64 %y, 39 @@ -718,66 +733,72 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 33 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: s_xor_b32 s3, s2, s4 +; GCN-NEXT: 
s_mov_b32 s0, s4 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-NEXT: s_or_b32 s5, s5, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NEXT: s_add_i32 s5, s6, s5 +; GCN-NEXT: s_mul_i32 s5, s5, s8 +; GCN-NEXT: s_sub_i32 s4, s4, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x1f0000 +; GCN-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 33 -; GCN-IR-NEXT: 
v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 33 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 -; GCN-IR-NEXT: s_or_b32 s3, s3, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s3 +; GCN-IR-NEXT: s_xor_b32 s5, s4, s8 +; GCN-IR-NEXT: s_ashr_i32 s5, s5, 30 +; GCN-IR-NEXT: s_or_b32 s5, s5, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN-IR-NEXT: s_cselect_b32 s5, s5, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-IR-NEXT: s_add_i32 s5, s6, s5 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s8 +; GCN-IR-NEXT: s_sub_i32 s4, s4, s5 +; GCN-IR-NEXT: s_bfe_i32 s4, s4, 0x1f0000 +; GCN-IR-NEXT: s_ashr_i32 s5, s4, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 @@ -790,28 +811,28 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_load_dword s4, s[0:1], 0xe +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-NEXT: s_xor_b32 s2, s3, s4 +; GCN-NEXT: s_xor_b32 s2, s3, s8 ; GCN-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_or_b32 s2, s2, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -819,28 +840,28 @@ ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 -; GCN-IR-NEXT: s_xor_b32 s2, s3, s4 +; GCN-IR-NEXT: s_xor_b32 s2, s3, s8 ; GCN-IR-NEXT: s_ashr_i32 s2, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_or_b32 s2, s2, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s2 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; 
GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s3, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -911,7 +932,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -967,19 +988,19 @@ ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s15 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; 
GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -992,6 +1013,7 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[0:1], 31 @@ -1005,37 +1027,39 @@ ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 ; GCN-IR-NEXT: s_sub_u32 s8, s6, s10 ; GCN-IR-NEXT: s_subb_u32 s9, s7, s10 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9 +; GCN-IR-NEXT: s_min_u32 s12, s6, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_min_u32 s16, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 +; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s8 -; GCN-IR-NEXT: s_add_i32 s10, 
s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s9 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s16, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_or_b64 s[18:19], s[14:15], s[20:21] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_add_u32 s18, s14, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 +; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] @@ -1066,32 +1090,25 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_branch .LBB8_6 -; GCN-IR-NEXT: .LBB8_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: 
v_cndmask_b32_e64 v0, v0, 0, s[14:15] -; GCN-IR-NEXT: .LBB8_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s9, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GCN-IR-NEXT: .LBB8_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-IR-NEXT: s_mul_i32 s11, s8, s11 +; GCN-IR-NEXT: s_mul_i32 s9, s9, s10 +; GCN-IR-NEXT: s_mul_i32 s8, s8, s10 +; GCN-IR-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-IR-NEXT: s_add_i32 s11, s12, s11 +; GCN-IR-NEXT: s_add_i32 s11, s11, s9 +; GCN-IR-NEXT: s_sub_u32 s2, s2, s8 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s11 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s0, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s1, s3, s1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 @@ -1128,10 +1145,10 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1161,37 +1178,40 @@ ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 ; GCN-IR-NEXT: s_sub_u32 s6, s6, s12 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s12 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_min_u32 s12, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 +; GCN-IR-NEXT: s_min_u32 s16, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 +; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec +; GCN-IR-NEXT: s_cselect_b32 s11, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s10, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s16, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_or_b64 s[18:19], s[14:15], s[20:21] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_mov_b32 s13, 0 
; GCN-IR-NEXT: s_cbranch_vccz .LBB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_add_u32 s18, s14, 1 +; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[18:19], 0 +; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s14 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s18 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] @@ -1222,34 +1242,28 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_branch .LBB9_6 -; GCN-IR-NEXT: .LBB9_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] -; GCN-IR-NEXT: .LBB9_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] +; GCN-IR-NEXT: .LBB9_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-IR-NEXT: s_mul_i32 s8, s6, s11 +; GCN-IR-NEXT: s_mul_i32 s7, s7, s10 +; GCN-IR-NEXT: 
s_mul_i32 s6, s6, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v1, s0, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v0, s1, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v0, v2, vcc +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 @@ -1309,9 +1323,9 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -1355,49 +1369,52 @@ ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: 
v_cmp_le_u32_e32 vcc, s8, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s6, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s7, s6 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s2, s6 -; GCN-IR-NEXT: s_subb_u32 s5, s3, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s4 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s5 -; GCN-IR-NEXT: s_min_u32 s8, s6, s7 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] -; GCN-IR-NEXT: s_or_b64 s[2:3], s[10:11], s[12:13] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s9, s8 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, s2, s8 +; GCN-IR-NEXT: s_subb_u32 s5, s3, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4 +; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5 +; GCN-IR-NEXT: s_min_u32 s8, s2, s3 +; GCN-IR-NEXT: s_add_u32 s2, s8, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 
s[10:11], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[2:3], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 63 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[10:11], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[10:11], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s10, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: s_add_u32 s10, s2, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], 24, s2 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 @@ -1406,46 +1423,41 @@ ; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s14, s10 -; GCN-IR-NEXT: s_subb_u32 s2, s15, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s3, 31 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s6, s14, s10 +; GCN-IR-NEXT: s_subb_u32 s6, s15, s11 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s13, s12 -; 
GCN-IR-NEXT: s_and_b32 s2, s12, 1 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 ; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s13 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 ; GCN-IR-NEXT: .LBB10_4: ; %Flow5 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: s_branch .LBB10_6 -; GCN-IR-NEXT: .LBB10_5: -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[10:11] -; GCN-IR-NEXT: .LBB10_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB10_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: s_mul_i32 s6, s4, s11 +; GCN-IR-NEXT: s_mul_i32 s5, s5, s10 +; GCN-IR-NEXT: s_mul_i32 s4, s4, s10 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s5, v0 +; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1950,55 +1962,65 @@ ; GCN-LABEL: s_test_srem24_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: 
s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 s5, 0x41c00000 -; GCN-NEXT: s_ashr_i32 s6, s4, 30 -; GCN-NEXT: s_or_b32 s6, s6, 1 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_or_b32 s8, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_mad_f32 v2, -v1, v0, s5 +; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mul_i32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s0, 24, s0 +; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_k_num_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[2:3], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GCN-IR-NEXT: s_mov_b32 s5, 
0x41c00000 -; GCN-IR-NEXT: s_ashr_i32 s6, s4, 30 -; GCN-IR-NEXT: s_or_b32 s6, s6, 1 +; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000 +; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s6 -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b32 s8, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s5 +; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v0| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s8, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-IR-NEXT: s_add_i32 s0, s1, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s0, s2 +; GCN-IR-NEXT: s_sub_i32 s0, 24, s0 +; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 %result = srem i64 24, %x.shr @@ -2010,58 +2032,62 @@ ; GCN-LABEL: s_test_srem24_k_den_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[2:3], 
s[2:3], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-NEXT: s_ashr_i32 s3, s2, 30 -; GCN-NEXT: s_or_b32 s3, s3, 1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v0, -v2, s4, v0 -; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-NEXT: s_or_b32 s3, s0, 1 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mulk_i32 s0, 0x5b7f +; GCN-NEXT: s_sub_i32 s0, s2, s0 +; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_k_den_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00 +; GCN-IR-NEXT: s_mov_b32 s8, 0x46b6fe00 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GCN-IR-NEXT: s_ashr_i32 s3, s2, 30 -; GCN-IR-NEXT: s_or_b32 s3, s3, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x38331158, v0 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; 
GCN-IR-NEXT: v_mad_f32 v0, -v2, s4, v0 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0 +; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-IR-NEXT: v_mad_f32 v0, -v1, s8, v0 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GCN-IR-NEXT: s_or_b32 s3, s0, 1 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, s8 +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s3, 0 +; GCN-IR-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-IR-NEXT: s_add_i32 s0, s1, s0 +; GCN-IR-NEXT: s_mulk_i32 s0, 0x5b7f +; GCN-IR-NEXT: s_sub_i32 s0, s2, s0 +; GCN-IR-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GCN-IR-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %x.shr = ashr i64 %x, 40 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -96,9 +96,7 @@ ; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c ; GCN: s_bitcmp1_b32 s[[SLO]], 0 -; SI: s_cselect_b64 s[[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0 -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s[[[VLO]]:[[VHI]]] -; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12 +; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) { %trunc = trunc i64 %x to i1 
%sel = select i1 %trunc, i32 63, i32 -12 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -31,11 +31,11 @@ ; SI-NEXT: v_mul_hi_u32 v2, v0, v2 ; SI-NEXT: v_mul_lo_u32 v3, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, v3, v0 -; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; SI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, v1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -67,11 +67,11 @@ ; VI-NEXT: v_mul_hi_u32 v2, v0, v2 ; VI-NEXT: v_mul_lo_u32 v3, v2, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; VI-NEXT: v_subrev_u32_e32 v0, vcc, v3, v0 -; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v1, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -99,11 +99,11 @@ ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v1 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 -; GCN-NEXT: v_subrev_u32_e32 v0, vcc, v5, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] +; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GCN-NEXT: 
v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc @@ -117,25 +117,29 @@ ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v4, 0, v1 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX1030-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX1030-NEXT: v_mul_lo_u32 v4, v4, v3 -; GFX1030-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX1030-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX1030-NEXT: v_mul_lo_u32 v4, v3, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v0, v0, v4 -; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v3 -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v0, v1 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v3 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX1030-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v0 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX1030-NEXT: s_sub_i32 s4, 0, s2 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1030-NEXT: s_mul_i32 s4, s4, s3 +; GFX1030-NEXT: s_mul_hi_u32 s4, s3, s4 +; GFX1030-NEXT: s_add_i32 s3, s3, s4 +; GFX1030-NEXT: s_mul_hi_u32 s3, s5, s3 +; GFX1030-NEXT: s_mul_i32 s4, s3, s2 +; GFX1030-NEXT: s_sub_i32 s4, s5, s4 +; GFX1030-NEXT: s_add_i32 s5, s3, 1 +; GFX1030-NEXT: s_sub_i32 s6, s4, s2 +; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 +; GFX1030-NEXT: s_cselect_b32 s3, s5, s3 +; GFX1030-NEXT: s_cselect_b32 s4, 
s6, s4 +; GFX1030-NEXT: s_add_i32 s5, s3, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 +; GFX1030-NEXT: s_cselect_b32 s2, s5, s3 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 ; GFX1030-NEXT: global_store_dword v2, v0, s[0:1] ; GFX1030-NEXT: s_endpgm ; @@ -194,18 +198,21 @@ ; SI-NEXT: v_mul_lo_u32 v1, s4, v0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0 -; SI-NEXT: v_mul_lo_u32 v1, v0, s3 -; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; SI-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; SI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; SI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_readfirstlane_b32 s0, v0 +; SI-NEXT: s_mul_i32 s0, s0, s3 +; SI-NEXT: s_sub_i32 s0, s2, s0 +; SI-NEXT: s_sub_i32 s1, s0, s3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b32 s0, s1, s0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; SI-NEXT: s_cmp_ge_u32 s0, s3 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -226,51 +233,56 @@ ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_mul_hi_u32 v0, s2, v0 -; VI-NEXT: v_mul_lo_u32 v1, v0, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; VI-NEXT: v_sub_u32_e32 v1, vcc, s2, v1 -; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; VI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s3, v1 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; VI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; 
VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_readfirstlane_b32 s0, v0 +; VI-NEXT: s_mul_i32 s0, s0, s3 +; VI-NEXT: s_sub_i32 s0, s2, s0 +; VI-NEXT: s_sub_i32 s1, s0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GCN-NEXT: s_sub_i32 s0, 0, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GCN-NEXT: s_sub_i32 s4, 0, s3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s7 -; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_sub_u32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: s_mul_i32 s4, s4, s3 +; GCN-NEXT: s_sub_i32 s2, s2, s4 +; GCN-NEXT: s_sub_i32 s4, s2, s3 +; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s2, s3 +; 
GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: s_cselect_b32 s2, s4, s2 +; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 @@ -278,24 +290,23 @@ ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_mul_i32 s5, s5, s4 ; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5 ; GFX1030-NEXT: s_add_i32 s4, s4, s5 ; GFX1030-NEXT: s_mul_hi_u32 s4, s2, s4 ; GFX1030-NEXT: s_mul_i32 s5, s4, s3 ; GFX1030-NEXT: s_sub_i32 s2, s2, s5 +; GFX1030-NEXT: s_add_i32 s5, s4, 1 +; GFX1030-NEXT: s_sub_i32 s6, s2, s3 ; GFX1030-NEXT: s_cmp_ge_u32 s2, s3 -; GFX1030-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1030-NEXT: s_cselect_b32 s4, s5, s4 +; GFX1030-NEXT: s_cselect_b32 s2, s6, s2 ; GFX1030-NEXT: s_add_i32 s5, s4, 1 -; GFX1030-NEXT: v_mov_b32_e32 v0, s5 -; GFX1030-NEXT: s_sub_i32 s5, s2, s3 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo -; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX1030-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1030-NEXT: s_cmp_ge_u32 s2, s3 +; GFX1030-NEXT: s_cselect_b32 s2, s5, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s2 +; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030-NEXT: s_endpgm ; ; EG-LABEL: s_udiv_i32: @@ 
-362,21 +373,21 @@ ; SI-NEXT: v_mul_hi_u32 v6, v4, v6 ; SI-NEXT: v_mul_hi_u32 v7, v5, v7 ; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mul_hi_u32 v4, v0, v4 ; SI-NEXT: v_mul_hi_u32 v5, v1, v5 ; SI-NEXT: v_mul_lo_u32 v6, v4, v2 ; SI-NEXT: v_mul_lo_u32 v8, v5, v3 ; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; SI-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 -; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] ; SI-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; SI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -417,21 +428,21 @@ ; VI-NEXT: v_mul_hi_u32 v6, v4, v6 ; VI-NEXT: v_mul_hi_u32 v7, v5, v7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; VI-NEXT: v_mul_hi_u32 v4, v0, v4 ; VI-NEXT: v_mul_hi_u32 v5, v1, v5 ; VI-NEXT: v_mul_lo_u32 v6, v4, v2 ; VI-NEXT: v_mul_lo_u32 v8, v5, v3 ; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_subrev_u32_e32 v1, vcc, v8, v1 +; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 +; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 -; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; 
VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; VI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] @@ -469,20 +480,20 @@ ; GCN-NEXT: v_mul_hi_u32 v8, v7, v8 ; GCN-NEXT: v_add_u32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 -; GCN-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 ; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v3 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 ; GCN-NEXT: v_add_u32_e32 v11, vcc, 1, v7 -; GCN-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 +; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] ; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] ; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] @@ -496,50 +507,58 @@ ; ; GFX1030-LABEL: udiv_v2i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] +; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_cvt_f32_u32_e32 v5, v2 -; GFX1030-NEXT: v_cvt_f32_u32_e32 v6, v3 -; GFX1030-NEXT: v_sub_nc_u32_e32 v7, 0, v2 -; GFX1030-NEXT: v_sub_nc_u32_e32 v8, 0, v3 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX1030-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; GFX1030-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v5, v5 -; 
GFX1030-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX1030-NEXT: v_mul_lo_u32 v7, v7, v5 -; GFX1030-NEXT: v_mul_lo_u32 v8, v8, v6 -; GFX1030-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX1030-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX1030-NEXT: v_add_nc_u32_e32 v5, v5, v7 -; GFX1030-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX1030-NEXT: v_mul_hi_u32 v5, v0, v5 -; GFX1030-NEXT: v_mul_hi_u32 v6, v1, v6 -; GFX1030-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX1030-NEXT: v_mul_lo_u32 v8, v6, v3 -; GFX1030-NEXT: v_sub_nc_u32_e32 v0, v0, v7 -; GFX1030-NEXT: v_add_nc_u32_e32 v7, 1, v5 -; GFX1030-NEXT: v_sub_nc_u32_e32 v1, v1, v8 -; GFX1030-NEXT: v_add_nc_u32_e32 v8, 1, v6 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2 -; GFX1030-NEXT: v_sub_nc_u32_e32 v9, v1, v3 -; GFX1030-NEXT: v_cmp_ge_u32_e64 s0, v1, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX1030-NEXT: v_sub_nc_u32_e32 v7, v0, v2 -; GFX1030-NEXT: v_cndmask_b32_e64 v6, v6, v8, s0 -; GFX1030-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX1030-NEXT: v_add_nc_u32_e32 v7, 1, v5 -; GFX1030-NEXT: v_add_nc_u32_e32 v8, 1, v6 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc_lo -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v3 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc_lo -; GFX1030-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX1030-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1030-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1030-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX1030-NEXT: s_sub_i32 s5, 0, s2 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v3 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1030-NEXT: v_readfirstlane_b32 s8, v0 +; 
GFX1030-NEXT: s_mul_i32 s5, s5, s4 +; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX1030-NEXT: s_add_i32 s4, s4, s5 +; GFX1030-NEXT: s_mul_hi_u32 s4, s6, s4 +; GFX1030-NEXT: s_mul_i32 s5, s4, s2 +; GFX1030-NEXT: s_sub_i32 s5, s6, s5 +; GFX1030-NEXT: s_add_i32 s6, s4, 1 +; GFX1030-NEXT: s_sub_i32 s7, s5, s2 +; GFX1030-NEXT: s_cmp_ge_u32 s5, s2 +; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_add_i32 s6, s4, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s5, s2 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1030-NEXT: s_cselect_b32 s2, s6, s4 +; GFX1030-NEXT: s_sub_i32 s4, 0, s3 +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: s_mul_i32 s4, s4, s8 +; GFX1030-NEXT: s_mul_hi_u32 s4, s8, s4 +; GFX1030-NEXT: s_add_i32 s8, s8, s4 +; GFX1030-NEXT: s_mul_hi_u32 s4, s5, s8 +; GFX1030-NEXT: s_mul_i32 s6, s4, s3 +; GFX1030-NEXT: s_sub_i32 s5, s5, s6 +; GFX1030-NEXT: s_add_i32 s6, s4, 1 +; GFX1030-NEXT: s_sub_i32 s7, s5, s3 +; GFX1030-NEXT: s_cmp_ge_u32 s5, s3 +; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_add_i32 s6, s4, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s5, s3 +; GFX1030-NEXT: s_cselect_b32 s3, s6, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX1030-NEXT: s_endpgm ; ; EG-LABEL: udiv_v2i32: @@ -641,9 +660,9 @@ ; SI-NEXT: v_mul_hi_u32 v11, v10, v11 ; SI-NEXT: v_mul_hi_u32 v13, v12, v13 ; SI-NEXT: v_mul_hi_u32 v15, v14, v15 -; SI-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v12 +; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13 ; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_hi_u32 v8, v4, v8 @@ -654,7 +673,7 @@ ; SI-NEXT: v_mul_lo_u32 v14, v9, v1 ; SI-NEXT: v_mul_lo_u32 v16, v10, v2 ; SI-NEXT: v_mul_lo_u32 v18, v11, v3 -; SI-NEXT: v_subrev_i32_e32 v4, vcc, v12, v4 +; 
SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 ; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v14 ; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 ; SI-NEXT: v_sub_i32_e32 v7, vcc, v7, v18 @@ -666,14 +685,14 @@ ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] ; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4 -; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] ; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 -; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] ; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 +; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] +; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] -; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] @@ -736,8 +755,8 @@ ; VI-NEXT: v_mul_hi_u32 v11, v10, v11 ; VI-NEXT: v_mul_hi_u32 v13, v12, v13 ; VI-NEXT: v_mul_hi_u32 v15, v14, v15 -; VI-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 +; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13 ; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -750,8 +769,8 @@ ; VI-NEXT: v_mul_lo_u32 v16, v10, v2 ; VI-NEXT: v_mul_lo_u32 v18, v11, v3 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 -; VI-NEXT: v_subrev_u32_e32 v5, vcc, v14, v5 -; VI-NEXT: v_subrev_u32_e32 v6, vcc, v16, v6 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v14 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 ; VI-NEXT: v_sub_u32_e32 v7, vcc, v7, v18 ; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8 ; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v9 @@ -761,14 +780,14 @@ ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], 
v6, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] ; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4 -; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] ; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5 -; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] ; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 -; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] +; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] ; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] @@ -831,8 +850,8 @@ ; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 ; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 -; GCN-NEXT: v_add_u32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 +; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 +; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 ; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -845,8 +864,8 @@ ; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 ; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, v4, v14 -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v16, v5 -; GCN-NEXT: v_subrev_u32_e32 v6, vcc, v18, v6 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v5, v16 +; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v18 ; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v19 ; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 ; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 @@ -856,120 +875,136 @@ ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 +; GCN-NEXT: v_subrev_u32_e32 v18, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v0, v4 +; GCN-NEXT: 
v_subrev_u32_e32 v15, vcc, v1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v1, v5 +; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 +; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v3, v7 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] -; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v15, s[0:1] -; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[2:3] -; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[6:7] -; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v18, s[0:1] +; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] +; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v17, s[4:5] +; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v14, s[6:7] +; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v17, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v14, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: s_endpgm ; ; GFX1030-LABEL: udiv_v4i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 
; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 -; GFX1030-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] +; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GFX1030-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GFX1030-NEXT: s_waitcnt vmcnt(1) -; GFX1030-NEXT: v_cvt_f32_u32_e32 v9, v0 -; GFX1030-NEXT: v_cvt_f32_u32_e32 v10, v1 -; GFX1030-NEXT: v_cvt_f32_u32_e32 v11, v2 -; GFX1030-NEXT: v_cvt_f32_u32_e32 v12, v3 -; GFX1030-NEXT: v_sub_nc_u32_e32 v13, 0, v0 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v9, v9 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v11, v11 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GFX1030-NEXT: v_sub_nc_u32_e32 v14, 0, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v15, 0, v2 -; GFX1030-NEXT: v_sub_nc_u32_e32 v16, 0, v3 -; GFX1030-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GFX1030-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GFX1030-NEXT: v_mul_f32_e32 v11, 0x4f7ffffe, v11 -; GFX1030-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX1030-NEXT: v_mul_lo_u32 v13, v13, v9 -; GFX1030-NEXT: v_mul_lo_u32 v14, v14, v10 -; GFX1030-NEXT: v_mul_lo_u32 v15, v15, v11 -; GFX1030-NEXT: v_mul_lo_u32 v16, v16, v12 -; GFX1030-NEXT: v_mul_hi_u32 v13, v9, v13 -; GFX1030-NEXT: v_mul_hi_u32 v14, v10, v14 -; GFX1030-NEXT: v_mul_hi_u32 v15, v11, v15 -; GFX1030-NEXT: v_mul_hi_u32 v16, v12, v16 -; GFX1030-NEXT: v_add_nc_u32_e32 v9, v9, v13 -; GFX1030-NEXT: v_add_nc_u32_e32 v10, v10, v14 -; GFX1030-NEXT: v_add_nc_u32_e32 v11, v11, v15 -; GFX1030-NEXT: v_add_nc_u32_e32 v12, v12, v16 +; GFX1030-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_mul_hi_u32 v9, v4, v9 -; GFX1030-NEXT: v_mul_hi_u32 v10, 
v5, v10 -; GFX1030-NEXT: v_mul_hi_u32 v11, v6, v11 -; GFX1030-NEXT: v_mul_hi_u32 v12, v7, v12 -; GFX1030-NEXT: v_mul_lo_u32 v13, v9, v0 -; GFX1030-NEXT: v_mul_lo_u32 v14, v10, v1 -; GFX1030-NEXT: v_mul_lo_u32 v15, v11, v2 -; GFX1030-NEXT: v_mul_lo_u32 v16, v12, v3 -; GFX1030-NEXT: v_add_nc_u32_e32 v17, 1, v9 -; GFX1030-NEXT: v_add_nc_u32_e32 v18, 1, v10 -; GFX1030-NEXT: v_add_nc_u32_e32 v19, 1, v11 -; GFX1030-NEXT: v_sub_nc_u32_e32 v4, v4, v13 -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v5, v14 -; GFX1030-NEXT: v_sub_nc_u32_e32 v6, v6, v15 -; GFX1030-NEXT: v_sub_nc_u32_e32 v7, v7, v16 -; GFX1030-NEXT: v_add_nc_u32_e32 v13, 1, v12 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v0 -; GFX1030-NEXT: v_sub_nc_u32_e32 v14, v4, v0 -; GFX1030-NEXT: v_cmp_ge_u32_e64 s0, v5, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v15, v5, v1 -; GFX1030-NEXT: v_cmp_ge_u32_e64 s1, v6, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e64 v10, v10, v18, s0 -; GFX1030-NEXT: v_sub_nc_u32_e32 v16, v6, v2 -; GFX1030-NEXT: v_cmp_ge_u32_e64 s2, v7, v3 -; GFX1030-NEXT: v_add_nc_u32_e32 v14, 1, v9 -; GFX1030-NEXT: v_cndmask_b32_e64 v5, v5, v15, s0 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v0 -; GFX1030-NEXT: v_cndmask_b32_e64 v11, v11, v19, s1 -; GFX1030-NEXT: v_cndmask_b32_e64 v12, v12, v13, s2 -; GFX1030-NEXT: v_sub_nc_u32_e32 v13, v7, v3 -; GFX1030-NEXT: v_add_nc_u32_e32 v15, 1, v10 -; GFX1030-NEXT: v_cndmask_b32_e64 v6, v6, v16, s1 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, v9, v14, vcc_lo -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v1 -; GFX1030-NEXT: v_add_nc_u32_e32 v16, 1, v11 -; GFX1030-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 -; GFX1030-NEXT: v_add_nc_u32_e32 v13, 1, v12 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc_lo -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v6, v2 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, v11, v16, vcc_lo -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v7, v3 -; GFX1030-NEXT: 
v_cndmask_b32_e32 v3, v12, v13, vcc_lo -; GFX1030-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX1030-NEXT: v_readfirstlane_b32 s7, v4 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v2 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX1030-NEXT: s_sub_i32 s6, 0, s2 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX1030-NEXT: v_readfirstlane_b32 s9, v1 +; GFX1030-NEXT: s_mul_i32 s6, s6, s4 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: s_mul_hi_u32 s6, s4, s6 +; GFX1030-NEXT: s_add_i32 s4, s4, s6 +; GFX1030-NEXT: s_mul_hi_u32 s4, s7, s4 +; GFX1030-NEXT: s_mul_i32 s6, s4, s2 +; GFX1030-NEXT: s_sub_i32 s6, s7, s6 +; GFX1030-NEXT: s_add_i32 s7, s4, 1 +; GFX1030-NEXT: s_sub_i32 s8, s6, s2 +; GFX1030-NEXT: s_cmp_ge_u32 s6, s2 +; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 +; GFX1030-NEXT: s_add_i32 s7, s4, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s6, s2 +; GFX1030-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_sub_i32 s6, 0, s3 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v5 +; GFX1030-NEXT: s_mul_i32 s6, s6, s9 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1030-NEXT: s_mul_hi_u32 s6, s9, s6 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX1030-NEXT: s_add_i32 s9, s9, s6 +; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s9 +; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 +; GFX1030-NEXT: s_mul_i32 s8, s6, s3 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: s_sub_i32 s7, s7, s8 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_sub_i32 s9, s7, s3 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s3 +; 
GFX1030-NEXT: s_cselect_b32 s6, s8, s6 +; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s3 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v6 +; GFX1030-NEXT: s_cselect_b32 s3, s8, s6 +; GFX1030-NEXT: s_sub_i32 s6, 0, s5 +; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 +; GFX1030-NEXT: s_mul_i32 s6, s6, s10 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6 +; GFX1030-NEXT: s_add_i32 s10, s10, s6 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10 +; GFX1030-NEXT: s_mul_i32 s8, s6, s5 +; GFX1030-NEXT: s_sub_i32 s7, s7, s8 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_sub_i32 s9, s7, s5 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s5 +; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 +; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 +; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s5 +; GFX1030-NEXT: v_readfirstlane_b32 s7, v7 +; GFX1030-NEXT: s_cselect_b32 s5, s8, s6 +; GFX1030-NEXT: s_sub_i32 s6, 0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: s_mul_i32 s6, s6, s10 +; GFX1030-NEXT: v_mov_b32_e32 v2, s5 +; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6 +; GFX1030-NEXT: s_add_i32 s10, s10, s6 +; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10 +; GFX1030-NEXT: s_mul_i32 s8, s6, s2 +; GFX1030-NEXT: s_sub_i32 s7, s7, s8 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_sub_i32 s9, s7, s2 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s2 +; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 +; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 +; GFX1030-NEXT: s_add_i32 s8, s6, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s7, s2 +; GFX1030-NEXT: s_cselect_b32 s2, s8, s6 +; GFX1030-NEXT: v_mov_b32_e32 v3, s2 +; GFX1030-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX1030-NEXT: s_endpgm ; ; EG-LABEL: udiv_v4i32: @@ -1855,11 +1890,11 @@ ; SI-NEXT: v_mul_hi_u32 v1, v2, v1 ; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; 
SI-NEXT: v_subrev_i32_e32 v2, vcc, v3, v2 -; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 -; SI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 -; SI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -1903,10 +1938,10 @@ ; VI-NEXT: v_mul_lo_u32 v3, v1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0 -; VI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v1 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc @@ -1958,10 +1993,10 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v4, v3 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v3 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc @@ -1980,32 +2015,39 @@ ; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 ; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3] ; GFX1030-NEXT: s_waitcnt vmcnt(3) -; GFX1030-NEXT: v_lshlrev_b32_e32 
v1, 16, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1030-NEXT: s_waitcnt vmcnt(2) +; GFX1030-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1030-NEXT: s_waitcnt vmcnt(1) -; GFX1030-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v3 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, 0, v1 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX1030-NEXT: v_mul_lo_u32 v5, v5, v2 -; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX1030-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX1030-NEXT: v_mul_hi_u32 v2, v3, v2 -; GFX1030-NEXT: v_mul_lo_u32 v4, v2, v1 -; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v4 -; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX1030-NEXT: v_sub_nc_u32_e32 v5, v3, v1 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v3, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo -; GFX1030-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX1030-NEXT: v_cmp_ge_u32_e32 vcc_lo, v3, v1 -; GFX1030-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX1030-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s5, v4 +; GFX1030-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1030-NEXT: s_or_b32 s2, s3, s2 +; GFX1030-NEXT: s_lshl_b32 s4, s4, 16 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX1030-NEXT: s_sub_i32 s6, 0, s2 +; GFX1030-NEXT: s_or_b32 s4, s5, s4 +; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1030-NEXT: s_mul_i32 s6, s6, s3 +; GFX1030-NEXT: s_mul_hi_u32 s6, s3, s6 +; GFX1030-NEXT: s_add_i32 s3, s3, s6 +; GFX1030-NEXT: s_mul_hi_u32 s3, s4, s3 +; GFX1030-NEXT: s_mul_i32 s5, s3, s2 +; GFX1030-NEXT: s_sub_i32 s4, s4, s5 +; GFX1030-NEXT: 
s_add_i32 s5, s3, 1 +; GFX1030-NEXT: s_sub_i32 s6, s4, s2 +; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 +; GFX1030-NEXT: s_cselect_b32 s3, s5, s3 +; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 +; GFX1030-NEXT: s_add_i32 s5, s3, 1 +; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 +; GFX1030-NEXT: s_cselect_b32 s2, s5, s3 +; GFX1030-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX1030-NEXT: v_mov_b32_e32 v1, s2 ; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1030-NEXT: s_endpgm ; @@ -2352,7 +2394,7 @@ ; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -2378,7 +2420,7 @@ ; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 ; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -2402,7 +2444,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm ; @@ -2522,7 +2564,7 @@ ; SI-NEXT: v_mul_lo_u32 v5, v3, s4 ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 ; SI-NEXT: s_mov_b32 s4, 0x186a0 -; SI-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 +; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; SI-NEXT: v_mul_lo_u32 v5, v2, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v6 @@ -2598,8 +2640,8 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v7, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 -; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; VI-NEXT: 
v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2615,7 +2657,7 @@ ; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 @@ -2685,8 +2727,8 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_add_u32_e32 v8, vcc, v3, v4 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2702,7 +2744,7 @@ ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -48,9 +48,9 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -100,12 +100,13 @@ ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GCN-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 @@ -115,9 +116,8 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -125,38 +125,41 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 -; GCN-IR-NEXT: s_add_i32 s14, s12, 32 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 -; GCN-IR-NEXT: s_min_u32 s10, s14, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s8, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: 
v_cmp_eq_u64_e64 s[18:19], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[18:19] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_add_i32 s6, s6, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_min_u32 s14, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s15, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 ; 
GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] @@ -187,18 +190,12 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: s_branch .LBB0_6 -; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] -; GCN-IR-NEXT: .LBB0_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y @@ -671,36 +668,36 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 +; GCN-NEXT: s_and_b32 s0, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s1, s3, 0xffff +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_and_b32 s8, s3, 0xffff -; GCN-NEXT: s_and_b32 s9, s2, 0xff000000 -; GCN-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 +; GCN-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NEXT: s_and_b32 s6, s6, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 24 ; 
GCN-NEXT: v_mac_f32_e32 v1, 0, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_sub_u32 s2, 0, s2 -; GCN-NEXT: s_subb_u32 s3, 0, s3 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_sub_u32 s8, 0, s0 +; GCN-NEXT: s_subb_u32 s9, 0, s1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 -; GCN-NEXT: v_mul_lo_u32 v6, s2, v1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 @@ -719,11 +716,11 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 @@ -740,9 +737,9 @@ ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s8, v3, 24 +; GCN-NEXT: v_alignbit_b32 v3, 
s7, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 ; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 @@ -756,12 +753,12 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v1 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 @@ -769,25 +766,26 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: buffer_store_short v1, off, 
s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s7, 0xffff ; GCN-IR-NEXT: s_and_b32 s2, s6, 0xff000000 @@ -797,26 +795,28 @@ ; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[0:1], 24 ; GCN-IR-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[0:1], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s0 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s1 -; GCN-IR-NEXT: s_min_u32 s10, s6, s7 -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9 -; GCN-IR-NEXT: s_min_u32 s14, s6, s7 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[6:7] +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s0 +; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s1 +; GCN-IR-NEXT: s_min_u32 s10, s2, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s2, s8 +; GCN-IR-NEXT: s_add_i32 s2, s2, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s3, s9 +; GCN-IR-NEXT: s_min_u32 s14, s2, s3 ; GCN-IR-NEXT: s_sub_u32 s6, s10, s14 ; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[6:7], 63 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[18:19] +; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[12:13], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s13, 0, s9 +; GCN-IR-NEXT: s_cselect_b32 s12, 0, s8 
+; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s6, 1 @@ -858,19 +858,14 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 -; GCN-IR-NEXT: s_branch .LBB7_6 -; GCN-IR-NEXT: .LBB7_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] -; GCN-IR-NEXT: .LBB7_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[12:13], s[2:3], s[0:1] +; GCN-IR-NEXT: .LBB7_5: ; %udiv-end ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s13 +; GCN-IR-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: s_waitcnt expcnt(0) +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 @@ -966,54 +961,57 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 2, v0 +; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 1, v0 ; GCN-NEXT: v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 1, v0 +; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1] ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] +; GCN-NEXT: 
v_cndmask_b32_e64 v4, v5, v7, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v2, v6, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_min_u32 s8, s6, s7 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[10:11], s[12:13] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_min_u32 s8, s8, s9 +; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; 
GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 +; GCN-IR-NEXT: s_add_u32 s12, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 @@ -1043,16 +1041,12 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: s_branch .LBB8_6 -; GCN-IR-NEXT: .LBB8_5: -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[10:11] -; GCN-IR-NEXT: .LBB8_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: .LBB8_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x @@ -1335,21 +1329,22 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; 
GCN-NEXT: s_movk_i32 s8, 0xffe8 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s8 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s8 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 @@ -1368,12 +1363,11 @@ ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s8 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s8 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -1390,15 +1384,15 @@ ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 
+; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1406,14 +1400,14 @@ ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v5, v0, 24 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_mul_lo_u32 v8, v0, 24 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 +; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s6, v8 ; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8 ; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -1421,17 +1415,17 @@ ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v8 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; 
GCN-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v8 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv_k_den_i64: @@ -1442,26 +1436,29 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_min_u32 s10, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s6, 59, s10 -; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: s_sub_u32 s8, 59, s10 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[8:9], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[8:9], s[12:13] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s8, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s7, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-IR-NEXT: s_add_u32 s12, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 
s[2:3], s8 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s12 ; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc4 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 @@ -1488,18 +1485,12 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3 ; GCN-IR-NEXT: .LBB11_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 -; GCN-IR-NEXT: s_branch .LBB11_6 -; GCN-IR-NEXT: .LBB11_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[8:9] -; GCN-IR-NEXT: .LBB11_6: ; %udiv-end +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-IR-NEXT: .LBB11_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,77 +36,84 @@ ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x26 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0x26 +; GFX6-NEXT: s_load_dword s9, s[0:1], 0x1d ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_sub_i32 s3, 0, s2 -; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_sub_i32 s2, 
0, s8 +; GFX6-NEXT: s_mov_b32 s3, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x1d +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_mov_b32 s2, s6 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX6-NEXT: v_readfirstlane_b32 s10, v0 +; GFX6-NEXT: s_mul_i32 s10, s10, s8 +; GFX6-NEXT: s_sub_i32 s9, s9, s10 +; GFX6-NEXT: s_sub_i32 s10, s9, s8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s9, s8 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: s_cselect_b32 s9, s10, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_sub_i32 s10, s9, s8 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: s_cmp_ge_u32 s9, s8 +; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: s_cselect_b32 s8, s10, s9 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98 -; 
GFX8-NEXT: s_load_dword s7, s[0:1], 0x74 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX8-NEXT: s_sub_i32 s2, 0, s6 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX8-NEXT: s_sub_i32 s2, 0, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 -; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: v_readfirstlane_b32 s0, v4 +; GFX8-NEXT: s_mul_i32 s0, s0, s4 +; GFX8-NEXT: s_sub_i32 s0, s5, s0 +; GFX8-NEXT: s_sub_i32 s1, s0, s4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4 +; GFX8-NEXT: s_cmp_ge_u32 s0, s4 +; GFX8-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8-NEXT: s_cselect_b32 s0, 
s1, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: s_sub_i32 s1, s0, s4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4 +; GFX8-NEXT: s_cmp_ge_u32 s0, s4 +; GFX8-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: s_cselect_b32 s0, s1, s0 +; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: flat_store_dword v[2:3], v0 ; GFX8-NEXT: s_endpgm %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 @@ -158,43 +165,47 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s6 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s6 +; GFX6-NEXT: s_cmp_ge_u32 s2, s6 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 ; GFX6-NEXT: s_sub_i32 s2, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 
v0, vcc, v2, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_sub_i32 s6, s5, s7 +; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: s_sub_i32 s6, s5, s7 +; GFX6-NEXT: s_cmp_ge_u32 s5, s7 +; GFX6-NEXT: s_cselect_b32 s5, s6, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -230,7 +241,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -323,77 +334,85 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s12, 0, s8 -; GFX6-NEXT: s_sub_i32 s13, 0, s9 +; GFX6-NEXT: s_sub_i32 s2, 0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: s_sub_i32 s4, 0, s10 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: s_sub_i32 s4, 0, s11 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: 
v_mul_f32_e32 v3, 0x4f7ffffe, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s8 +; GFX6-NEXT: s_sub_i32 s2, s4, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s8 +; GFX6-NEXT: s_cmp_ge_u32 s2, s8 +; GFX6-NEXT: s_cselect_b32 s4, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: 
s_mul_i32 s2, s2, s9 +; GFX6-NEXT: s_sub_i32 s2, s5, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s9 +; GFX6-NEXT: s_cmp_ge_u32 s2, s9 +; GFX6-NEXT: s_cselect_b32 s5, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s10 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s2, s2, s10 +; GFX6-NEXT: s_sub_i32 s2, s6, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s2, s3, s2 +; GFX6-NEXT: s_sub_i32 s3, s2, s10 +; GFX6-NEXT: s_cmp_ge_u32 s2, s10 +; GFX6-NEXT: s_cselect_b32 s6, s3, s2 +; GFX6-NEXT: s_sub_i32 s2, 0, s11 +; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: v_readfirstlane_b32 s4, v2 +; GFX6-NEXT: s_mul_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, s11 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s5, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -449,7 +468,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; 
GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -76,12 +76,9 @@ ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64: ; VI-DAG: s_cmp_eq_u32 -; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0 -; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]] -; SI-DAG: s_cmp_eq_u32 -; SI-DAG: s_cselect_b64 vcc, -1, 0 -; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc +; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] ; GCN: s_endpgm define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -51,7 +51,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -104,19 +104,19 @@ ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s11 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; 
GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -124,38 +124,41 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s4 -; GCN-IR-NEXT: s_add_i32 s14, s12, 32 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s5 -; GCN-IR-NEXT: s_min_u32 s10, s14, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s8, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_or_b64 s[16:17], s[12:13], s[18:19] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_add_i32 s6, 
s6, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_min_u32 s14, s6, s7 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[8:9], s[16:17] +; GCN-IR-NEXT: s_and_b64 s[8:9], s[16:17], exec +; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[16:17], 0 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s16 ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] @@ -186,30 +189,24 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: s_branch .LBB0_6 -; GCN-IR-NEXT: .LBB0_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 
-; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] -; GCN-IR-NEXT: .LBB0_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] +; GCN-IR-NEXT: .LBB0_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-IR-NEXT: s_mov_b32 s12, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mov_b32 s9, s1 +; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s14, -1 +; GCN-IR-NEXT: s_mov_b32 s13, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out @@ -753,9 +750,9 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -781,7 +778,7 @@ ; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: 
v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc @@ -799,47 +796,50 @@ ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_min_u32 s8, s6, s7 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[4:5], s[10:11] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[10:11], s[12:13] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 +; 
GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_min_u32 s8, s8, s9 +; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 63 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[12:13] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[12:13], exec +; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 +; GCN-IR-NEXT: s_add_u32 s12, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s9 ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s12 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 @@ -869,27 +869,22 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_3 ; GCN-IR-NEXT: .LBB6_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 -; GCN-IR-NEXT: s_branch .LBB6_6 -; GCN-IR-NEXT: .LBB6_5: -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[10:11] -; GCN-IR-NEXT: .LBB6_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 -; 
GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: .LBB6_5: ; %udiv-end +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: s_mov_b32 s8, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s2, s7 +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s3, s6 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s2, s6 +; GCN-IR-NEXT: v_sub_i32_e64 v0, vcc, 24, s0 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 24, %x store i64 %result, i64 addrspace(1)* %out @@ -902,9 +897,9 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_movk_i32 s2, 0xffe8 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -912,13 +907,13 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 
v2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -935,12 +930,12 @@ ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s2 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -957,15 +952,15 @@ ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; 
GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -975,8 +970,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc @@ -987,16 +982,16 @@ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], 23, v0 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 23, v0 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_den_i64: @@ -1007,26 +1002,29 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_min_u32 s8, s6, s7 -; GCN-IR-NEXT: s_sub_u32 s6, 59, s8 -; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: s_sub_u32 s10, 59, s8 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[6:7], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_or_b64 
s[10:11], s[4:5], s[10:11] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[10:11], s[12:13] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 63 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 +; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s6, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-IR-NEXT: s_add_u32 s12, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s9 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s10 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], s[2:3], s12 ; GCN-IR-NEXT: s_add_u32 s8, s8, 0xffffffc4 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 @@ -1053,28 +1051,21 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 -; GCN-IR-NEXT: s_branch .LBB7_6 -; GCN-IR-NEXT: .LBB7_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[10:11] -; GCN-IR-NEXT: .LBB7_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, 24 -; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 
24 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] +; GCN-IR-NEXT: .LBB7_5: ; %udiv-end +; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, 24 +; GCN-IR-NEXT: s_mov_b32 s8, s0 +; GCN-IR-NEXT: s_mul_i32 s0, s7, 24 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 +; GCN-IR-NEXT: s_mul_i32 s0, s6, 24 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s10, -1 +; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, 24 store i64 %result, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -13,9 +13,9 @@ ; VI: s_cselect_b32 ; SI-DAG: s_cmp_gt_i32 -; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: s_cselect_b32 ; SI-DAG: s_cmp_gt_i32 -; SI-DAG: v_cndmask_b32_e32 +; SI-DAG: s_cselect_b32 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { entry: @@ -59,10 +59,10 @@ ; VI: s_cselect_b32 ; VI: s_cselect_b32 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 +; SI-DAG: s_cselect_b32 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: