diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8298,6 +8298,16 @@
   SDLoc DL(Op);
   SDValue Cond = Op.getOperand(0);
+  if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
+      !Op->isDivergent()) {
+    if (VT == MVT::i64)
+      return Op;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
+  }
+
   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
   SDValue One = DAG.getConstant(1, DL, MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -78,8 +78,11 @@
   moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;
 
-  void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
-                   MachineDominatorTree *MDT = nullptr) const;
+  void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
+                     MachineDominatorTree *MDT = nullptr) const;
+
+  void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
+                     MachineDominatorTree *MDT = nullptr) const;
 
   void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6035,8 +6035,11 @@
       continue;
     case AMDGPU::S_CSELECT_B32:
+      lowerSelect32(Worklist, Inst, MDT);
+      Inst.eraseFromParent();
+      continue;
     case AMDGPU::S_CSELECT_B64:
-      lowerSelect(Worklist, Inst, MDT);
+      splitSelect64(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
     case AMDGPU::S_CMP_EQ_I32:
@@ -6214,8 +6217,8 @@
   return std::make_pair(false, nullptr);
 }
 
-void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
-                              MachineDominatorTree *MDT) const {
+void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
+                                MachineDominatorTree *MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6290,6 +6293,95 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
+                                MachineDominatorTree *MDT) const {
+  // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them
+  // further.
+  const DebugLoc &DL = Inst.getDebugLoc();
+  MachineBasicBlock::iterator MII = Inst;
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Get the original operands.
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  MachineOperand &Cond = Inst.getOperand(3);
+
+  Register SCCSource = Cond.getReg();
+  bool IsSCC = (SCCSource == AMDGPU::SCC);
+
+  // If this is a trivial select where the condition is effectively not SCC
+  // (SCCSource is a source of copy to SCC), then the select is semantically
+  // equivalent to copying SCCSource. Hence, there is no need to create
+  // V_CNDMASK, we can just use that and bail out.
+  if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) &&
+      (Src1.isImm() && Src1.getImm() == 0)) {
+    MRI.replaceRegWith(Dest.getReg(), SCCSource);
+    return;
+  }
+
+  // Prepare the split destination.
+  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  // Split the source operands.
+  const TargetRegisterClass *Src0RC = nullptr;
+  const TargetRegisterClass *Src0SubRC = nullptr;
+  if (Src0.isReg()) {
+    Src0RC = MRI.getRegClass(Src0.getReg());
+    Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
+  }
+  const TargetRegisterClass *Src1RC = nullptr;
+  const TargetRegisterClass *Src1SubRC = nullptr;
+  if (Src1.isReg()) {
+    Src1RC = MRI.getRegClass(Src1.getReg());
+    Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
+  }
+  // Split lo.
+  MachineOperand SrcReg0Sub0 =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+  MachineOperand SrcReg1Sub0 =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+  // Split hi.
+  MachineOperand SrcReg0Sub1 =
+      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 =
+      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+  // Select the lo part.
+  MachineInstr *LoHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0)
+          .add(SrcReg0Sub0)
+          .add(SrcReg1Sub0);
+  // Replace the condition operand with the original one.
+  LoHalf->getOperand(3).setReg(SCCSource);
+  Worklist.insert(LoHalf);
+  // Select the hi part.
+  MachineInstr *HiHalf =
+      BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1)
+          .add(SrcReg0Sub1)
+          .add(SrcReg1Sub1);
+  // Replace the condition operand with the original one.
+  HiHalf->getOperand(3).setReg(SCCSource);
+  Worklist.insert(HiHalf);
+  // Merge them back to the original 64-bit one.
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+      .addReg(DestSub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(DestSub1)
+      .addImm(AMDGPU::sub1);
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+
+  // Try to legalize the operands in case we need to swap the order to keep
+  // it valid.
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*HiHalf, MDT);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -517,9 +517,10 @@
 def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
   [(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
 >;
+def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64",
+  [(set i64:$sdst, (SelectPat i64:$src0, i64:$src1))]
+>;
 }
-
-def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
 } // End Uses = [SCC]
 
 let Defs = [SCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -17,17 +17,15 @@
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_load_dword s[[PTR:[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
-; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
-; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
+; GFX9-DAG: s_lshl_b32 s[[SSRC_SHARED_BASE:[0-9]+]], [[SSRC_SHARED]], 16
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b64 vcc, -1, 0
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9: s_cmp_lg_u32 s[[PTR]], -1
+; GFX9: s_cselect_b64 s{{\[}}[[SEL_LO:[0-9]+]]:[[SEL_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR]]:[[SSRC_SHARED_BASE]]{{\]}}, 0
+; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[SEL_LO]]
+; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[SEL_HI]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
@@ -84,19 +82,17 @@
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 
-; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_load_dword s[[PTR:[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
-; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
-; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
+; GFX9-DAG: s_lshl_b32 s[[SSRC_PRIVATE_BASE:[0-9]+]], [[SSRC_PRIVATE]], 16
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b64 vcc, -1, 0
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9: s_cmp_lg_u32 s[[PTR]], -1
+; GFX9: s_cselect_b64 s{{\[}}[[SEL_LO:[0-9]+]]:[[SEL_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR]]:[[SSRC_PRIVATE_BASE]]{{\]}}, 0
+; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[SEL_LO]]
+; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[SEL_HI]]
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare
-amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { ; CHECK-LABEL: @udiv_i32( @@ -95,6 +96,34 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, i32 addrspace(1)* %out ret void @@ -185,6 +214,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, i32 addrspace(1)* %out ret void @@ -308,6 +363,43 @@ ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_i32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b32 s3, s3, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: 
s_ashr_i32 s5, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s5 +; GFX90A-NEXT: s_xor_b32 s4, s5, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s2, s2, s5 +; GFX90A-NEXT: s_sub_i32 s5, 0, s3 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, i32 addrspace(1)* %out ret void @@ -422,6 +514,40 @@ ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_i32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b32 s3, s3, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s5, 0, s3 +; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s2, s2, s4 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, i32 addrspace(1)* %out ret void @@ -489,6 +615,26 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc ; GFX9-NEXT: global_store_short v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_and_b32 s2, s2, 0xffff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 
+; GFX90A-NEXT: s_endpgm %r = udiv i16 %x, %y store i16 %r, i16 addrspace(1)* %out ret void @@ -563,6 +709,28 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_and_b32 s4, s2, 0xffff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX90A-NEXT: global_store_short v3, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = urem i16 %x, %y store i16 %r, i16 addrspace(1)* %out ret void @@ -644,6 +812,31 @@ ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: global_store_short v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, i16 addrspace(1)* %out ret void @@ -732,6 +925,33 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s5, s4, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX90A-NEXT: s_sext_i32_i16 s0, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GFX90A-NEXT: s_xor_b32 s0, s0, s5 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s6, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: global_store_short v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem i16 %x, %y store i16 %r, i16 addrspace(1)* %out ret void @@ -795,6 +1015,24 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, 
v4, vcc ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 +; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 +; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX90A-NEXT: global_store_byte v2, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %r = udiv i8 %x, %y store i8 %r, i8 addrspace(1)* %out ret void @@ -867,6 +1105,27 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 +; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 +; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem i8 %x, %y store i8 %r, i8 addrspace(1)* %out ret void @@ -948,6 +1207,31 @@ ; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_sext_i32_i8 s1, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, i8 addrspace(1)* %out ret void @@ -1038,6 +1322,34 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX90A-NEXT: s_sext_i32_i8 s1, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 
+; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 +; GFX90A-NEXT: s_or_b32 s6, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 +; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem i8 %x, %y store i8 %r, i8 addrspace(1)* %out ret void @@ -1348,6 +1660,92 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v4i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: s_sub_i32 s2, 0, s8 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s8 +; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX90A-NEXT: s_sub_i32 s2, 0, s9 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s9 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_sub_i32 s2, 0, s10 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 +; GFX90A-NEXT: v_mul_f32_e32 v5, s3, v5 +; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v6, s10, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; 
GFX90A-NEXT: s_sub_i32 s2, 0, s11 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v5, v3 +; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s11 +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 +; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v6, s11, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm %r = udiv <4 x i32> %x, %y store <4 x i32> %r, <4 x i32> addrspace(1)* %out ret void @@ -1634,6 +2032,84 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v4i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX90A-NEXT: s_sub_i32 s2, 0, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: s_sub_i32 s3, 0, s9 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GFX90A-NEXT: s_sub_i32 s2, 0, s10 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v2 +; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s11 +; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s10 +; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 +; GFX90A-NEXT: 
v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX90A-NEXT: s_sub_i32 s2, 0, s11 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s11 +; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm %r = urem <4 x i32> %x, %y store <4 x i32> %r, <4 x i32> addrspace(1)* %out ret void @@ -2052,6 +2528,128 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v4i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s13, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 +; GFX90A-NEXT: s_add_i32 s3, s8, s2 +; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_ashr_i32 s8, s4, 31 +; GFX90A-NEXT: s_add_i32 s4, s4, s8 +; GFX90A-NEXT: s_xor_b32 s2, s8, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s4, s4, s8 +; GFX90A-NEXT: s_sub_i32 s8, 0, s3 +; GFX90A-NEXT: s_ashr_i32 s12, s9, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 +; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX90A-NEXT: s_add_i32 s4, s9, s12 +; GFX90A-NEXT: s_xor_b32 s4, s4, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 +; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 +; GFX90A-NEXT: s_add_i32 s5, s5, s2 +; GFX90A-NEXT: s_xor_b32 s3, s2, s12 +; GFX90A-NEXT: s_xor_b32 s2, s5, s2 +; GFX90A-NEXT: s_sub_i32 s5, 0, s4 +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v1 +; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 +; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX90A-NEXT: s_ashr_i32 s2, s10, 31 +; GFX90A-NEXT: s_add_i32 s5, s10, s2 +; GFX90A-NEXT: s_xor_b32 s5, s5, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s4, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 
+; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_xor_b32_e32 v1, s3, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s13, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s3, v1 +; GFX90A-NEXT: s_ashr_i32 s3, s6, 31 +; GFX90A-NEXT: s_add_i32 s4, s6, s3 +; GFX90A-NEXT: s_xor_b32 s2, s3, s2 +; GFX90A-NEXT: s_xor_b32 s3, s4, s3 +; GFX90A-NEXT: s_sub_i32 s4, 0, s5 +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v2 +; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s5 +; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 +; GFX90A-NEXT: s_ashr_i32 s3, s11, 31 +; GFX90A-NEXT: s_add_i32 s4, s11, s3 +; GFX90A-NEXT: s_xor_b32 s4, s4, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v5, s5, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v6 +; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s13, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 +; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 +; GFX90A-NEXT: s_add_i32 s5, s7, s2 +; GFX90A-NEXT: s_xor_b32 s3, s2, s3 +; GFX90A-NEXT: s_xor_b32 s2, s5, s2 +; GFX90A-NEXT: s_sub_i32 s5, 0, s4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s4 +; GFX90A-NEXT: v_sub_u32_e32 v5, s2, v5 +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v6, s4, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX90A-NEXT: v_xor_b32_e32 v3, s3, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm %r = sdiv <4 x i32> %x, %y store <4 x i32> %r, <4 x i32> addrspace(1)* %out ret void @@ -2434,6 +3032,116 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v4i32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 +; GFX90A-NEXT: s_add_i32 s3, s8, s2 +; GFX90A-NEXT: s_xor_b32 s2, s3, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: s_ashr_i32 s8, s9, 31 +; GFX90A-NEXT: s_add_i32 s9, s9, s8 +; GFX90A-NEXT: s_xor_b32 s8, s9, s8 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX90A-NEXT: s_sub_i32 s9, 0, s2 +; GFX90A-NEXT: s_ashr_i32 s3, s4, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_add_i32 s4, s4, s3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; 
GFX90A-NEXT: s_xor_b32 s4, s4, s3 +; GFX90A-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: s_sub_i32 s4, 0, s8 +; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 +; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX90A-NEXT: s_add_i32 s3, s5, s2 +; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 +; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 +; GFX90A-NEXT: s_add_i32 s4, s10, s3 +; GFX90A-NEXT: s_xor_b32 s3, s4, s3 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, s2, v1 +; GFX90A-NEXT: s_sub_i32 s5, 0, s3 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s2, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 +; GFX90A-NEXT: s_add_i32 s4, s6, s2 +; GFX90A-NEXT: s_xor_b32 s4, s4, s2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v2 +; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX90A-NEXT: s_ashr_i32 s4, s11, 31 +; GFX90A-NEXT: s_add_i32 s5, s11, s4 +; GFX90A-NEXT: s_xor_b32 s4, s5, s4 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX90A-NEXT: s_sub_i32 s5, 0, s4 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 +; GFX90A-NEXT: s_add_i32 s3, s7, s2 +; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 +; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v3 +; 
GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90A-NEXT: s_endpgm %r = srem <4 x i32> %x, %y store <4 x i32> %r, <4 x i32> addrspace(1)* %out ret void @@ -2643,6 +3351,65 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v4i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 +; GFX90A-NEXT: s_and_b32 s4, s4, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out ret void @@ -2876,6 +3643,73 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v4i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX90A-NEXT: s_and_b32 s9, s4, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 +; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: s_and_b32 s4, s7, s8 +; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_and_b32 s4, s5, s8 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 +; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 +; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out ret void @@ -3140,6 +3974,84 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v4i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_sext_i32_i16 s0, s6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s8, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 +; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, s0, v3 +; GFX90A-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX90A-NEXT: s_xor_b32 s0, s4, s1 +; GFX90A-NEXT: 
v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s7 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX90A-NEXT: s_sext_i32_i16 s0, s5 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX90A-NEXT: s_ashr_i32 s0, s5, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v0 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 +; GFX90A-NEXT: v_mad_f32 v5, -v6, v0, v5 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 +; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v3 +; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out ret void @@ -3428,6 +4340,92 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v4i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_sext_i32_i16 s0, s6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s8, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 +; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s8 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX90A-NEXT: s_xor_b32 s0, s4, s8 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s6, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mad_f32 v3, -v4, v1, v3 +; 
GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v1| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 +; GFX90A-NEXT: v_add_u32_e32 v1, s0, v4 +; GFX90A-NEXT: s_sext_i32_i16 s0, s7 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX90A-NEXT: s_sext_i32_i16 s1, s5 +; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 +; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX90A-NEXT: s_ashr_i32 s5, s5, 16 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX90A-NEXT: s_xor_b32 s0, s5, s4 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s6, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 +; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 +; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out ret void @@ -3497,6 +4495,27 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: global_store_byte v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX90A-NEXT: s_and_b32 s0, s4, 7 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 +; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 +; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc +; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv i3 %x, %y store i3 %r, i3 addrspace(1)* %out ret void @@ -3575,6 +4594,30 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, 
s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX90A-NEXT: s_and_b32 s1, s4, 7 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s1 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 +; GFX90A-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX90A-NEXT: v_mad_f32 v2, -v2, v1, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem i3 %x, %y store i3 %r, i3 addrspace(1)* %out ret void @@ -3658,6 +4701,32 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, i3 addrspace(1)* %out ret void @@ -3750,6 +4819,35 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 +; GFX90A-NEXT: s_or_b32 s6, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 +; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem i3 %x, %y store i3 %r, i3 addrspace(1)* %out ret void @@ -3915,6 +5013,53 @@ ; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v3i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s8, 
0xffff +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s0, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-NEXT: s_and_b32 s0, s4, s8 +; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out ret void @@ -4102,6 +5247,59 @@ ; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v3i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s8, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s4, s8 +; GFX90A-NEXT: s_and_b32 s0, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX90A-NEXT: v_sub_u32_e32 v0, s1, v0 +; GFX90A-NEXT: s_and_b32 s1, s5, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: 
v_cvt_u32_f32_e32 v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 +; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out ret void @@ -4307,6 +5505,67 @@ ; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 ; GFX9-NEXT: global_store_dword v1, v2, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v3i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_sext_i32_i16 s1, s4 +; GFX90A-NEXT: s_sext_i32_i16 s0, s6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_or_b32 s8, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 +; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 +; GFX90A-NEXT: v_add_u32_e32 v2, s0, v3 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX90A-NEXT: s_xor_b32 s0, s4, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_sext_i32_i16 s1, s7 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 +; GFX90A-NEXT: s_sext_i32_i16 s0, s5 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX90A-NEXT: global_store_short v1, v0, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v1, v2, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out ret void @@ -4533,6 +5792,73 @@ ; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v3i16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_sext_i32_i16 s9, s4 +; GFX90A-NEXT: s_sext_i32_i16 s8, s6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s9 +; GFX90A-NEXT: s_xor_b32 s0, s9, s8 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: s_or_b32 s10, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 +; GFX90A-NEXT: s_ashr_i32 s6, s6, 16 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 +; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX90A-NEXT: s_xor_b32 s0, s4, s6 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX90A-NEXT: s_or_b32 s8, s0, 1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 +; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 +; GFX90A-NEXT: s_sext_i32_i16 s6, s7 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s6 +; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 +; GFX90A-NEXT: s_sext_i32_i16 s4, s5 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX90A-NEXT: s_xor_b32 s0, s4, s6 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: s_or_b32 s5, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 +; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out ret void @@ -4716,6 +6042,62 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v3i15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s0, s4, s8 +; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: s_bfe_u32 s1, s4, 0xf000f +; GFX90A-NEXT: v_alignbit_b32 v3, s7, 
v3, 30 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_f32_e32 v1, v0, v7 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out ret void @@ -4921,6 +6303,70 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v3i15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s4, s8 +; GFX90A-NEXT: s_and_b32 s9, s6, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX90A-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s7 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX90A-NEXT: s_bfe_u32 s5, s4, 0xf000f +; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 +; GFX90A-NEXT: s_lshr_b32 s1, s6, 15 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_lshr_b32 s0, s4, 15 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_sub_u32_e32 v6, s0, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, v8, v9 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX90A-NEXT: v_mad_f32 v1, -v1, v5, v8 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, s8, v6 +; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out ret void @@ -5144,6 +6590,76 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v3i15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf0000 +; GFX90A-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX90A-NEXT: s_or_b32 s5, s0, 1 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 +; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf000f +; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 +; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 +; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX90A-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX90A-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v7, v1 +; GFX90A-NEXT: v_mad_f32 v1, -v1, v3, v6 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| +; GFX90A-NEXT: s_movk_i32 s0, 0x7fff +; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; 
GFX90A-NEXT: v_and_b32_e32 v3, s0, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, s0, v5 +; GFX90A-NEXT: v_add_u32_e32 v0, v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out ret void @@ -5401,6 +6917,90 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 ; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v3i15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s0, s4, s8 +; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX90A-NEXT: s_bfe_i32 s0, s0, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 +; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 +; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX90A-NEXT: s_or_b32 s11, s0, 1 +; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 +; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f +; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 +; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 +; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 +; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 +; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 +; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 +; GFX90A-NEXT: v_mad_f32 v5, -v6, v4, v5 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 +; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 +; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 +; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_bfe_i32 v7, v0, 0, 15 +; GFX90A-NEXT: v_cvt_f32_i32_e32 v8, v7 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v6 +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v5 +; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 30, v5 +; GFX90A-NEXT: v_or_b32_e32 v5, 1, v5 +; GFX90A-NEXT: v_mul_f32_e32 v7, v8, v9 +; GFX90A-NEXT: v_trunc_f32_e32 v7, v7 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v9, v7 +; GFX90A-NEXT: v_mad_f32 v7, -v7, v6, v8 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s7 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX90A-NEXT: 
v_sub_u32_e32 v4, s5, v4 +; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 +; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 +; GFX90A-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out ret void @@ -5442,6 +7042,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i32_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX90A-NEXT: s_sub_i32 s1, s4, s0 +; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 +; GFX90A-NEXT: s_add_i32 s1, s1, s0 +; GFX90A-NEXT: s_lshr_b32 s0, s1, 20 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out ret void @@ -5475,6 +7090,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv i32 %x, 4096 store i32 %r, i32 addrspace(1)* %out ret void @@ -5511,6 +7137,18 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_add_i32 s0, s5, 12 +; GFX90A-NEXT: s_lshr_b32 s0, s4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y store i32 %r, i32 addrspace(1)* %out @@ -5554,6 +7192,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 +; GFX90A-NEXT: s_lshr_b32 s1, s5, 12 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -5604,6 +7255,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_mul_hi_u32 s1, s5, 0x100101 +; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 +; GFX90A-NEXT: s_sub_i32 s4, s5, s1 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 1 +; GFX90A-NEXT: s_add_i32 s4, s4, s1 +; GFX90A-NEXT: s_lshr_b32 s1, s4, 11 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -5785,6 +7453,58 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x1000 +; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX90A-NEXT: s_sub_i32 s1, 0, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX90A-NEXT: s_sub_i32 s1, 0, s0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = udiv <2 x i32> %x, %shl.y store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -5832,6 +7552,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i32_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 +; GFX90A-NEXT: s_sub_i32 s1, s4, s0 +; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 +; GFX90A-NEXT: s_add_i32 s1, s1, s0 +; GFX90A-NEXT: s_lshr_b32 
s0, s1, 20 +; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX90A-NEXT: s_sub_i32 s0, s4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out ret void @@ -5865,6 +7602,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem i32 %x, 4096 store i32 %r, i32 addrspace(1)* %out ret void @@ -5903,6 +7651,19 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s5 +; GFX90A-NEXT: s_add_i32 s0, s0, -1 +; GFX90A-NEXT: s_and_b32 s0, s4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y store i32 %r, i32 addrspace(1)* %out @@ -5948,6 +7709,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v2i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_movk_i32 s0, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s4, s0 +; GFX90A-NEXT: s_and_b32 s0, s5, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -6117,6 +7892,54 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x1000 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 +; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_sub_i32 s1, 0, s2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX90A-NEXT: v_sub_u32_e32 v0, s6, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: s_sub_i32 s1, 0, s0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX90A-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = urem <2 x i32> %x, %shl.y store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -6159,6 +7982,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i32_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX90A-NEXT: s_add_i32 s0, s0, s4 +; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 +; GFX90A-NEXT: s_add_i32 s0, s0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out ret void @@ -6198,6 +8036,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_add_i32 s4, s4, s0 +; GFX90A-NEXT: s_ashr_i32 s0, s4, 12 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv i32 %x, 4096 store i32 %r, i32 addrspace(1)* %out ret void @@ -6287,6 +8139,44 @@ ; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_i32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b32 s3, s3, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s6, 0, s3 +; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s5 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s2, s2, s5 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 +; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; 
GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: s_xor_b32 s2, s5, s4 +; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y store i32 %r, i32 addrspace(1)* %out @@ -6342,6 +8232,25 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v2i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 +; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 +; GFX90A-NEXT: s_add_i32 s0, s4, s0 +; GFX90A-NEXT: s_add_i32 s1, s5, s1 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 +; GFX90A-NEXT: s_ashr_i32 s1, s1, 12 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -6398,6 +8307,26 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 +; GFX90A-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_add_i32 s1, s1, s5 +; GFX90A-NEXT: s_add_i32 s0, s4, s0 +; GFX90A-NEXT: s_lshr_b32 s4, s1, 31 +; GFX90A-NEXT: s_ashr_i32 s1, s1, 11 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 +; GFX90A-NEXT: s_add_i32 s1, s1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -6631,6 +8560,76 @@ ; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x1000 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s10, 0x4f7ffffe +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 +; GFX90A-NEXT: s_ashr_i32 s9, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s9 +; GFX90A-NEXT: s_xor_b32 s2, s2, s9 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: s_ashr_i32 s1, s6, 31 +; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 +; GFX90A-NEXT: s_add_i32 s3, s6, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s6, s1, s9 +; GFX90A-NEXT: s_xor_b32 s1, s3, s1 +; GFX90A-NEXT: s_sub_i32 s3, 0, s2 +; GFX90A-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX90A-NEXT: 
v_mul_hi_u32 v1, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s2 +; GFX90A-NEXT: v_sub_u32_e32 v1, s1, v1 +; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 +; GFX90A-NEXT: s_add_i32 s0, s0, s1 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v4 +; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 +; GFX90A-NEXT: s_add_i32 s3, s7, s2 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s10, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_xor_b32 s1, s2, s1 +; GFX90A-NEXT: s_xor_b32 s2, s3, s2 +; GFX90A-NEXT: s_sub_i32 s3, 0, s0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = sdiv <2 x i32> %x, %shl.y store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -6678,6 +8677,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i32_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 +; GFX90A-NEXT: s_add_i32 s0, s0, s4 +; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 +; GFX90A-NEXT: s_add_i32 s0, s0, s1 +; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb +; GFX90A-NEXT: s_sub_i32 s0, s4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out ret void @@ -6719,6 +8735,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_add_i32 s0, s4, s0 +; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX90A-NEXT: s_sub_i32 s0, s4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem i32 %x, 4096 store i32 %r, i32 addrspace(1)* %out ret void @@ -6802,6 +8833,41 
@@ ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_i32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b32 s3, s3, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s5, 0, s3 +; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s4 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_xor_b32 s2, s2, s4 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y store i32 %r, i32 addrspace(1)* %out @@ -6863,6 +8929,28 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v2i32_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_movk_i32 s6, 0xf000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 +; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 +; GFX90A-NEXT: s_add_i32 s0, s4, s0 +; GFX90A-NEXT: s_add_i32 s1, s5, s1 +; GFX90A-NEXT: s_and_b32 s0, s0, s6 +; GFX90A-NEXT: s_and_b32 s1, s1, s6 +; GFX90A-NEXT: s_sub_i32 s0, s4, s0 +; GFX90A-NEXT: s_sub_i32 s1, s5, s1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out ret void @@ -7079,6 +9167,70 @@ ; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0x1000 +; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b32 s0, s8, s6 +; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 +; GFX90A-NEXT: s_add_i32 s0, s0, s1 +; GFX90A-NEXT: s_xor_b32 s0, s0, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-NEXT: s_lshl_b32 s1, s8, s7 +; GFX90A-NEXT: s_sub_i32 s8, 0, s0 +; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: s_add_i32 s4, s4, s6 +; GFX90A-NEXT: s_xor_b32 s4, s4, s6 
+; GFX90A-NEXT: s_ashr_i32 s7, s1, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_add_i32 s1, s1, s7 +; GFX90A-NEXT: s_xor_b32 s1, s1, s7 +; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 +; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: s_add_i32 s4, s5, s0 +; GFX90A-NEXT: s_sub_i32 s5, 0, s1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_xor_b32 s4, s4, s0 +; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -7316,26 +9468,141 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i64_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f176a73 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: 
s_movk_i32 s2, 0xfee0 +; GFX90A-NEXT: s_mov_b32 s3, 0x68958c89 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 +; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 +; GFX90A-NEXT: s_movk_i32 s2, 0x11f +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 
+; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 +; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, s2 +; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX90A-NEXT: s_movk_i32 s3, 0x11e +; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 +; GFX90A-NEXT: s_mov_b32 s6, 0x976a7376 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v7, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s2, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out ret void @@ -7371,6 +9638,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm %r = udiv i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out ret void @@ -7411,6 +9688,18 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_add_i32 s2, s2, 12 +; GFX90A-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = udiv i64 %x, %shl.y store i64 %r, i64 addrspace(1)* %out @@ -7458,6 +9747,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <2 x i64> %x, 
<i64 4096, i64 4096>
store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -7600,6 +9904,7 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_movk_i32 s8, 0xfff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 @@ -7624,7 +9929,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 ; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 @@ -7638,7 +9942,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v10, v2, v8 ; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: s_movk_i32 s0, 0xfff +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc @@ -7658,43 +9962,147 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 +; GFX9-NEXT: s_movk_i32 s4, 0xffe ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s0 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc -; GFX9-NEXT: s_movk_i32 s0, 0xffe -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 +; GFX9-NEXT: 
v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0xf001 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 +; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 +; GFX90A-NEXT: v_sub_u32_e32 v6, v6, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v9 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_lo_u32 v9, v2, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v8, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; 
GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: s_movk_i32 s0, 0xfff +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s0 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v3 +; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0xffe +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -7750,6 +10158,24 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_add_i32 s0, s8, 12 +; GFX90A-NEXT: s_add_i32 s8, s10, 12 +; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = udiv <2 x i64> %x, %shl.y store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -7990,21 +10416,141 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i64_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 +; GFX90A-NEXT: s_mov_b32 s3, 0x689e0837 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 +; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 +; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; 
GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 +; GFX90A-NEXT: s_movk_i32 s8, 0x11f +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] +; GFX90A-NEXT: s_movk_i32 s6, 0x11e +; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 +; GFX90A-NEXT: s_mov_b32 s10, 0x9761f7c8 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 +; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] +; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out ret void @@ -8039,6 +10585,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s2, s2, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm %r = urem i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out ret void @@ -8085,6 +10641,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX90A-NEXT: s_add_u32 s0, s0, -1 +; GFX90A-NEXT: s_addc_u32 s1, s1, -1 +; GFX90A-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] +; GFX90A-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = urem i64 %x, %shl.y store i64 %r, i64 addrspace(1)* %out @@ -8133,6 +10704,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v2i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s0, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_and_b32 s1, s4, s0 +; GFX90A-NEXT: s_and_b32 s0, s6, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = urem <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -8198,6 +10784,29 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: urem_v2i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; GFX90A-NEXT: s_add_u32 s0, s0, -1 +; GFX90A-NEXT: s_addc_u32 s1, s1, -1 +; GFX90A-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] +; GFX90A-NEXT: s_add_u32 s4, s10, -1 +; GFX90A-NEXT: s_addc_u32 s5, s11, -1 +; GFX90A-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = urem <2 x i64> %x, %shl.y store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -8348,6 +10957,9 @@ ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -8387,64 +10999,171 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, 
vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s1 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v4 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc +; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i64_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_mov_b32 s2, 
0xffed2705 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 +; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 +; GFX90A-NEXT: s_add_u32 s2, s6, s0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: s_mov_b32 s1, s0 +; GFX90A-NEXT: s_addc_u32 s3, s7, s0 +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s1 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s1 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s3 +; GFX90A-NEXT: 
v_sub_co_u32_e32 v4, vcc, s2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v4 +; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, -1, v4, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out ret void @@ -8488,6 +11207,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 +; GFX90A-NEXT: s_add_u32 s2, s2, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, 0 +; GFX90A-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm %r = sdiv i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out ret void @@ -8750,25 +11483,22 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -8776,6 
+11506,140 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX90A-NEXT: s_ashr_i32 s8, s3, 31 +; GFX90A-NEXT: s_add_u32 s2, s2, s8 +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_addc_u32 s3, s3, s8 +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX90A-NEXT: s_sub_u32 s10, 0, s2 +; GFX90A-NEXT: s_subb_u32 s11, 0, s3 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s10, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX90A-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v6, s10, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s10, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s11, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v0 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s6, s10 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: s_addc_u32 s1, s7, s10 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, s3 +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s2, v5 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v7, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s3, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y store i64 %r, i64 addrspace(1)* %out @@ -8839,6 +11703,29 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v2i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_add_u32 s0, s4, s0 +; GFX90A-NEXT: s_addc_u32 s1, s5, 0 +; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 +; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 +; GFX90A-NEXT: s_add_u32 s4, s6, s4 +; GFX90A-NEXT: s_addc_u32 s5, s7, 0 +; GFX90A-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -9017,6 +11904,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc @@ -9029,7 +11917,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v7, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0 ; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 @@ -9063,40 +11950,37 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_movk_i32 s0, 0xfff +; GFX9-NEXT: s_movk_i32 s3, 0xfff ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc -; GFX9-NEXT: s_movk_i32 s0, 0xffe -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v5 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc +; GFX9-NEXT: s_movk_i32 s3, 0xffe +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, 
v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 @@ -9105,8 +11989,130 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x457ff000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0xf001 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX90A-NEXT: s_add_u32 s0, s4, s0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: s_addc_u32 s1, s5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s8 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 +; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s8 +; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 +; GFX90A-NEXT: s_add_u32 s6, s6, s0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: s_mov_b32 s1, s0 +; 
GFX90A-NEXT: s_addc_u32 s7, s7, s0 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: s_movk_i32 s1, 0xfff +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 +; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc +; GFX90A-NEXT: s_movk_i32 s1, 0xffe +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -9511,67 +12517,64 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v7, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], 2, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: s_xor_b64 s[4:5], s[14:15], s[12:13] ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 -; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] +; GFX9-NEXT: s_add_u32 s0, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_addc_u32 s9, s9, s12 -; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v11, s9 +; GFX9-NEXT: s_addc_u32 s1, s9, s12 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], 
s[12:13] ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s9 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX9-NEXT: v_mac_f32_e32 v8, s16, v9 +; GFX9-NEXT: v_rcp_f32_e32 v8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 -; GFX9-NEXT: v_mac_f32_e32 v10, s16, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GFX9-NEXT: v_rcp_f32_e32 v3, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: s_sub_u32 s10, 0, s8 -; GFX9-NEXT: v_mul_f32_e32 v3, s17, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v2, s17, v8 +; GFX9-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] +; GFX9-NEXT: s_sub_u32 s10, 0, s8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: s_subb_u32 s11, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s10, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX9-NEXT: v_mul_lo_u32 v9, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s10, v3 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, s10, v3 +; GFX9-NEXT: v_mul_lo_u32 v9, s11, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, s10, v2 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v9 +; GFX9-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v9, v2, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v11, v3, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, v4 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 +; GFX9-NEXT: v_mul_lo_u32 v10, v3, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s5, v1 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v8, s10, v3 +; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v3, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v8, s10, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, s10, v2 ; GFX9-NEXT: v_mul_lo_u32 v10, s11, v2 ; GFX9-NEXT: v_mul_lo_u32 v11, s10, v2 @@ -9581,25 +12584,25 @@ ; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 ; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 ; GFX9-NEXT: v_mul_hi_u32 v14, v2, 
v8 -; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 -; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 +; GFX9-NEXT: v_mul_hi_u32 v10, v4, v11 +; GFX9-NEXT: v_mul_lo_u32 v11, v4, v11 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 -; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX9-NEXT: v_mul_hi_u32 v9, v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, v8 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 ; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 @@ -9635,25 +12638,22 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 2, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v9, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v9, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 1, 2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v4 +; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v2, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 @@ -9662,6 +12662,267 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 +; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc +; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX90A-NEXT: s_ashr_i32 s10, s3, 31 +; GFX90A-NEXT: s_add_u32 s2, s2, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: s_addc_u32 s3, s3, s10 +; 
GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 +; GFX90A-NEXT: s_sub_u32 s14, 0, s12 +; GFX90A-NEXT: s_subb_u32 s15, 0, s13 +; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s15, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s15, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s14, v0 +; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s4, s14 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: s_addc_u32 s1, s5, s14 +; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc +; 
GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v0 +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 +; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, s13 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v5 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v8, v7, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, 1, 2, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v8, s5 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 +; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 +; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] +; GFX90A-NEXT: s_add_u32 s8, s8, s4 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s9, s9, s4 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: s_sub_u32 s10, 0, s8 +; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_subb_u32 s11, 0, s9 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s11, v2 +; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, s10, v2 +; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX90A-NEXT: v_mul_lo_u32 v9, s11, v2 +; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 +; GFX90A-NEXT: v_mul_lo_u32 v10, s10, v2 +; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v14, v2, v8 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v10 +; GFX90A-NEXT: v_mul_hi_u32 v13, v2, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 +; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 +; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s6, s10 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: s_addc_u32 s1, s7, s10 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v3 +; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v2 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v2 +; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, s9 +; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 +; GFX90A-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v7 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v9, v8, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v9, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 +; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc +; GFX90A-NEXT: 
v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -9810,6 +13071,9 @@ ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s0, s7, 31 +; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -9849,62 +13113,173 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s7, 31 -; GFX9-NEXT: s_add_u32 s0, s6, s2 +; GFX9-NEXT: s_add_u32 s2, s6, s0 +; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 -; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, s1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s1, v0 ; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s3, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s1, v2 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 +; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 
0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i64_oddk_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 +; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_mov_b32 s2, 0xffed2705 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 +; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 +; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 +; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 +; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 +; 
GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 +; GFX90A-NEXT: s_add_u32 s2, s6, s0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: s_mov_b32 s1, s0 +; GFX90A-NEXT: s_addc_u32 s3, s7, s0 +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc +; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v0 +; GFX90A-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 +; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc +; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out ret void @@ -9952,6 +13327,22 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 +; GFX90A-NEXT: s_add_u32 s4, s2, s4 +; GFX90A-NEXT: s_addc_u32 s5, s3, 0 +; GFX90A-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX90A-NEXT: s_sub_u32 
s2, s2, s4 +; GFX90A-NEXT: s_subb_u32 s3, s3, s5 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm %r = srem i64 %x, 4096 store i64 %r, i64 addrspace(1)* %out ret void @@ -10216,19 +13607,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 @@ -10236,6 +13627,141 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_u32 s2, s2, s4 +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: s_sub_u32 s2, 0, s8 +; GFX90A-NEXT: s_subb_u32 s3, 0, s9 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s6, s10 +; GFX90A-NEXT: s_addc_u32 s1, s7, s10 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v1 +; GFX90A-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s9, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s9 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 +; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] +; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s8, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s10, v0 +; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y store i64 %r, i64 addrspace(1)* %out @@ -10309,6 +13835,34 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v2i64_pow2k_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: s_movk_i32 s8, 0xf000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 +; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 +; GFX90A-NEXT: s_add_u32 s0, s4, s0 +; GFX90A-NEXT: s_addc_u32 s1, s5, 0 +; GFX90A-NEXT: s_and_b32 s0, s0, s8 +; GFX90A-NEXT: s_sub_u32 s0, s4, s0 +; GFX90A-NEXT: s_subb_u32 s1, s5, s1 +; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 +; GFX90A-NEXT: s_add_u32 s4, s6, s4 +; GFX90A-NEXT: s_addc_u32 s5, s7, 0 +; GFX90A-NEXT: s_and_b32 s4, s4, s8 +; GFX90A-NEXT: s_sub_u32 s4, s6, s4 +; GFX90A-NEXT: s_subb_u32 s5, s7, s5 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX90A-NEXT: s_endpgm %r = srem <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out ret void @@ -10704,37 +14258,37 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] -; GFX9-NEXT: s_ashr_i32 s2, s11, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GFX9-NEXT: s_add_u32 s10, s10, s2 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v4 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s11, s11, s2 -; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s11 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v7, v1, vcc -; GFX9-NEXT: v_mac_f32_e32 v8, s16, v9 +; GFX9-NEXT: s_ashr_i32 s0, s11, 31 +; GFX9-NEXT: s_add_u32 s2, s10, s0 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_addc_u32 s3, s11, s0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc 
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s11 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX9-NEXT: v_rcp_f32_e32 v8, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GFX9-NEXT: v_mac_f32_e32 v4, s16, v7 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v3, s17, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_mul_f32_e32 v3, s17, v4 ; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 @@ -10745,7 +14299,7 @@ ; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 ; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 @@ -10835,19 +14389,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v8, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, s12 @@ -10856,6 +14410,269 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm +; +; GFX90A-LABEL: srem_v2i64_pow2_shl_denom: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 +; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 +; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc +; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 +; GFX90A-NEXT: s_add_u32 s2, s2, s4 +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_addc_u32 s3, s3, s4 +; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 +; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 +; GFX90A-NEXT: s_sub_u32 s2, 0, 
s12 +; GFX90A-NEXT: s_subb_u32 s3, 0, s13 +; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 +; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 +; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v10, v2, v8 +; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 +; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 +; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s4, s14 +; GFX90A-NEXT: s_addc_u32 s1, s5, s14 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc +; GFX90A-NEXT: v_mul_lo_u32 
v1, s12, v1 +; GFX90A-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 +; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s13 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v0 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 +; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 +; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] +; GFX90A-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v5, s5 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] +; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GFX90A-NEXT: s_add_u32 s2, s10, s0 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX90A-NEXT: s_mov_b32 s1, s0 +; GFX90A-NEXT: s_addc_u32 s3, s11, s0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX90A-NEXT: s_sub_u32 s2, 0, s4 +; GFX90A-NEXT: s_subb_u32 s3, 0, s5 +; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s14 +; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 +; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 +; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s3, v2 +; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v2 +; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 +; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 +; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 +; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 
v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1] +; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 +; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v2 +; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 +; GFX90A-NEXT: v_mul_lo_u32 v9, s3, v2 +; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 +; GFX90A-NEXT: v_mul_lo_u32 v10, s2, v2 +; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v12, v5, v10 +; GFX90A-NEXT: v_mul_lo_u32 v14, v2, v8 +; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v10 +; GFX90A-NEXT: v_mul_hi_u32 v13, v2, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 +; GFX90A-NEXT: v_mul_hi_u32 v9, v5, v8 +; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 +; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] +; GFX90A-NEXT: s_add_u32 s0, s6, s10 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: s_addc_u32 s1, s7, s10 +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 +; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 +; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v2 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, s5 +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 +; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc +; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v2 +; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 +; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v7 +; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 +; GFX90A-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v8 +; GFX90A-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] +; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v7, s7 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX90A-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, s10 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y store <2 x i64> %r, <2 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll @@ -159,7 +159,7 @@ } ; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64: -; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5 +; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 0, 5 define amdgpu_kernel void @sdiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i64 121, i64 23 %bo = sdiv i64 120, %sel @@ -177,7 +177,7 @@ } ; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64: -; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5 +; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 0, 5 define amdgpu_kernel void @udiv_constant_sel_constants_i64(i64 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i64 -4, i64 23 %bo = udiv i64 120, %sel @@ -186,7 +186,7 @@ } ; GCN-LABEL: {{^}}srem_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3 +; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 33, 3 define amdgpu_kernel void @srem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i64 34, i64 15 %bo = srem i64 33, %sel @@ -195,7 +195,7 @@ } ; GCN-LABEL: {{^}}urem_constant_sel_constants: -; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3 +; GCN: s_cselect_b64 s[{{[0-9]+}}:{{[0-9]+}}], 33, 3 define amdgpu_kernel void @urem_constant_sel_constants(i64 addrspace(1)* %p, i1 %cond) { %sel = select i1 %cond, i64 34, i64 15 %bo = urem i64 33, %sel diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -38,16 +38,23 @@ ; GCN-LABEL: {{^}}double4_extelt: ; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b +; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1 +; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29 +; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}} +; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14 +; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a +; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2 +; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}} +; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a +; GCN-DAG: 
s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d +; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3 +; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]] +; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel @@ -57,19 +64,27 @@ ; GCN-LABEL: {{^}}double5_extelt: ; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b +; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1 +; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29 +; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4 -; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}} +; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14 +; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a +; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2 +; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}} +; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a +; GCN-DAG: s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d +; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3 +; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}} +; Double literals 5.01 and 4.01 share the same low 32 bits.
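+; 4.01 = 0x40100a3d70a3d70a and 5.01 = 0x40140a3d70a3d70a, so only the high dword of the 5.01 literal needs a new s_mov_b32.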
+; GCN-DAG: s_mov_b32 s[[L4HI:[0-9]+]], 0x40140a3d +; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4 +; GCN: s_cselect_b64 s{{\[}}[[T3LO:[0-9]+]]:[[T3HI:[0-9]+]]{{\]}}, s{{\[}}[[T2LO]]:[[T2HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L4HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T3LO]] +; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T3HI]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel @@ -107,11 +122,15 @@ ; GCN-LABEL: {{^}}double2_extelt: ; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b +; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1 +; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29 +; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5 ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T0LO]] +; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T0HI]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2: ; GCN: buffer_load_dwordx4 @@ -14,15 +14,22 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 +; SI-DAG: s_cmp_eq_u32 [[IDX]], 2 +; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +;
SI: store_dwordx2 v[{{[0-9:]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2 +; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}} +; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T1LO]] +; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T1HI]] +; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out @@ -31,19 +38,28 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: ; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3 -; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 +; SI-DAG: s_cmp_eq_u32 [[IDX]], 2 +; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 +; SI-DAG: s_cmp_eq_u32 [[IDX]], 3 +; SI-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0 +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]] +; SI: store_dwordx2 v[{{[0-9:]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2 +; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}} +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 3 +; VI: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T1LO]]:[[T1HI]]{{\]}} +; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]] +; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]] +; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; How the replacement of i64 stores with v2i32 stores resulted in ; breaking other users of the bitcast if they already existed @@ -32,10 +32,14 @@ ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: ; GCN-NOT: buffer_load ; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI: store_dwordx2 v[{{[0-9:]+}}] +; VI: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]] +; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out @@ -59,16 +63,23 @@ } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: -; GCN-NOT: buffer_load -; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 -; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 -; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2 -; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] -; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] -; GCN: store_dwordx2 v[{{[0-9:]+}}] +; SI-NOT: buffer_load +; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0 +; SI-DAG: s_cmp_eq_u32 [[IDX]], 2 +; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0 +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]] +; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]] +; SI: store_dwordx2 v[{{[0-9:]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1 +; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] +; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2 +; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}} +; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T1LO]] +; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T1HI]] +; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out @@ -77,19 +88,28 @@ ; 
GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}]
+; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
+; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; SI-DAG: s_cmp_eq_u32 [[IDX]], 3
+; SI-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; SI: store_dwordx2 v[{{[0-9:]+}}]
+; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
+; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
+; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 3
+; VI: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}
+; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
+; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
+; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -15,8 +15,8 @@
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: s_lshl_b32 s0, s0, 16
 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, s0, vcc_lo
 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, s0, vcc_lo
 ; GCN-NEXT: flat_store_dword v[0:1], v2
 ; GCN-NEXT: s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -285,16 +285,18 @@
 }
 ; GCN-LABEL: {{^}}double2_inselt:
+; GCN: s_load_dwordx4 s{{\[}}[[FIRST:[0-9]+]]:[[LAST:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 0
-; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
+; GCN: s_cselect_b64 s{{\[}}[[P0_LO:[0-9]+]]:[[P0_HI:[0-9]+]]{{\]}}, s{{\[}}{{[0-9]+}}:[[LAST]]{{\]}}, 1.0
+; GCN: s_cmp_lg_u32 [[IDX]], 0
+; GCN: s_cselect_b64 s{{\[}}[[P1_LO:[0-9]+]]:[[P1_HI:[0-9]+]]{{\]}}, s{{\[}}[[FIRST]]:{{[0-9]+}}{{\]}}, 1.0
+; GCN: v_mov_b32_e32 v[[V_FIRST:[0-9]+]], s[[P1_LO]]
+; GCN: v_mov_b32_e32 v[[V_SECOND:[0-9]+]], s[[P1_HI]]
+; GCN: v_mov_b32_e32 v[[V_THIRD:[0-9]+]], s[[P0_LO]]
+; GCN: v_mov_b32_e32 v[[V_LAST:[0-9]+]], s[[P0_HI]]
+; GCN: flat_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_FIRST]]:[[V_LAST]]{{\]}}
 define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
@@ -305,7 +307,7 @@
 ; GCN-LABEL: {{^}}double5_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-COUNT-10: v_cndmask_b32
+; GCN-COUNT-5: s_cselect_b64
 define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
 entry:
   %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1627,25 +1627,22 @@
 ;
 ; VI-LABEL: dynamic_insertelement_v2f64:
 ; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s12, s[4:5], 0x60
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
-; VI-NEXT: s_load_dword s4, s[4:5], 0x60
-; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s5, 0x40200000
 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_cmp_eq_u32 s4, 1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_cmp_eq_u32 s4, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: s_cmp_lg_u32 s12, 1
+; VI-NEXT: s_cselect_b64 s[6:7], s[10:11], s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s12, 0
+; VI-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
@@ -1685,18 +1682,14 @@
 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
 ; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u32 s6, 1
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_cmp_eq_u32 s6, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s6, 1
+; VI-NEXT: s_cselect_b64 s[4:5], s[10:11], 5
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cselect_b64 s[6:7], s[8:9], 5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
@@ -1745,25 +1738,19 @@
 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
 ; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_cmp_eq_u32 s12, 1
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_cmp_eq_u32 s12, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s12, 1
+; VI-NEXT: s_cselect_b64 s[4:5], s[10:11], 5
+; VI-NEXT: s_cmp_lg_u32 s12, 0
+; VI-NEXT: s_cselect_b64 s[8:9], s[8:9], 5
+; VI-NEXT: s_cmp_lg_u32 s12, 2
+; VI-NEXT: s_cselect_b64 s[6:7], s[6:7], 5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
 ; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_cmp_eq_u32 s12, 2
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, s7
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
-; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
@@ -1811,38 +1798,32 @@
 ;
 ; VI-LABEL: dynamic_insertelement_v4f64:
 ; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s16, s[4:5], 0x40
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
-; VI-NEXT: s_load_dword s4, s[4:5], 0x40
-; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s5, 0x40200000
 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
-; VI-NEXT: s_mov_b32 s2, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_cmp_eq_u32 s4, 1
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_cmp_eq_u32 s4, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
+; VI-NEXT: s_cmp_lg_u32 s16, 1
+; VI-NEXT: s_cselect_b64 s[6:7], s[10:11], s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s16, 0
+; VI-NEXT: s_cselect_b64 s[8:9], s[8:9], s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s16, 3
+; VI-NEXT: s_cselect_b64 s[10:11], s[14:15], s[4:5]
+; VI-NEXT: s_cmp_lg_u32 s16, 2
+; VI-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT: s_nop 0
 ; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_cmp_eq_u32 s4, 3
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s15
-; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v5, s14
-; VI-NEXT: s_cmp_eq_u32 s4, 2
-; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
-; VI-NEXT: v_mov_b32_e32 v5, s13
-; VI-NEXT: s_cselect_b64 vcc, -1, 0
-; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s12
-; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT: s_endpgm
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -344,46 +344,39 @@
 ; GFX9: ; %bb.0: ; %bb
 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s6, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s5, s1, s2
-; GFX9-NEXT: s_addc_u32 s6, 0, s6
-; GFX9-NEXT: s_add_u32 s9, s9, s5
+; GFX9-NEXT: s_mul_i32 s9, s0, s3
+; GFX9-NEXT: s_mul_hi_u32 s10, s0, s2
+; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX9-NEXT: s_add_u32 s6, s10, s9
+; GFX9-NEXT: s_mul_i32 s8, s1, s2
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s6, s6, s8
 ; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s9
-; GFX9-NEXT: s_addc_u32 s6, 0, s6
-; GFX9-NEXT: s_sub_u32 s9, s4, s2
-; GFX9-NEXT: s_subb_u32 s10, s6, 0
+; GFX9-NEXT: s_mul_hi_i32 s7, s1, s3
+; GFX9-NEXT: s_addc_u32 s4, s5, s4
+; GFX9-NEXT: s_addc_u32 s5, s7, 0
+; GFX9-NEXT: s_mul_i32 s6, s1, s3
+; GFX9-NEXT: s_add_u32 s4, s4, s6
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_sub_u32 s6, s4, s2
+; GFX9-NEXT: s_subb_u32 s7, s5, 0
 ; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
-; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v2
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_sub_u32 s6, s4, s0
+; GFX9-NEXT: s_subb_u32 s7, s5, 0
 ; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s5
-; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_add_i32 s1, s10, s9
+; GFX9-NEXT: s_add_i32 s1, s1, s8
+; GFX9-NEXT: s_ashr_i32 s6, s1, 31
+; GFX9-NEXT: s_mov_b32 s7, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX9-NEXT: s_mul_i32 s2, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -391,42 +384,37 @@
 ; GFX10: ; %bb.0: ; %bb
 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s6, s0, s3
-; GFX10-NEXT: s_add_u32 s11, s8, s7
-; GFX10-NEXT: s_mul_i32 s5, s1, s2
-; GFX10-NEXT: s_addc_u32 s6, 0, s6
+; GFX10-NEXT: s_mul_i32 s9, s0, s3
+; GFX10-NEXT: s_mul_hi_u32 s10, s0, s2
+; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
+; GFX10-NEXT: s_add_u32 s11, s10, s9
+; GFX10-NEXT: s_mul_i32 s8, s1, s2
+; GFX10-NEXT: s_addc_u32 s5, 0, s5
 ; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_add_u32 s11, s11, s5
-; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s6, s4
-; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s6, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s6, 0, s6
-; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s6, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
+; GFX10-NEXT: s_add_u32 s11, s11, s8
+; GFX10-NEXT: s_mul_hi_i32 s6, s1, s3
+; GFX10-NEXT: s_addc_u32 s4, s5, s4
+; GFX10-NEXT: s_mul_i32 s7, s1, s3
+; GFX10-NEXT: s_addc_u32 s5, s6, 0
+; GFX10-NEXT: s_add_u32 s4, s4, s7
+; GFX10-NEXT: s_addc_u32 s5, 0, s5
+; GFX10-NEXT: s_sub_u32 s6, s4, s2
+; GFX10-NEXT: s_subb_u32 s7, s5, 0
 ; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s10
-; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT: s_sub_u32 s6, s4, s0
+; GFX10-NEXT: s_subb_u32 s7, s5, 0
 ; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
 ; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
-; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: s_ashr_i32 s4, s1, 31
-; GFX10-NEXT: s_mov_b32 s5, s4
-; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT: s_add_i32 s1, s10, s9
+; GFX10-NEXT: s_add_i32 s1, s1, s8
+; GFX10-NEXT: s_ashr_i32 s6, s1, 31
+; GFX10-NEXT: s_mov_b32 s7, s6
+; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX10-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT: s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -9,8 +9,7 @@
 ; GCN: s_load_dwordx2
 ; GCN: s_cmp_eq_u32
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b64
 ; GCN-NOT: load_dword
 ; GCN: flat_load_dwordx2
@@ -35,8 +34,7 @@
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b64
 ; GCN: flat_store_dwordx2
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -1,13 +1,53 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=VI %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck -check-prefix=GFX90A %s
-; GCN-LABEL: {{^}}select0:
-; i64 select should be split into two i32 selects, and we shouldn't need
-; to use a shfit to extract the hi dword of the input.
-; GCN-NOT: s_lshr_b64
-; GCN: v_cndmask
-; GCN: v_cndmask
 define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+; SI-LABEL: select0:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lt_u32 s2, 6
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: select0:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cmp_lt_u32 s4, 6
+; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX90A-LABEL: select0:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_lt_u32 s6, 6
+; GFX90A-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX90A-NEXT: s_endpgm
 entry:
   %0 = icmp ugt i32 %cond, 5
   %1 = select i1 %0, i64 0, i64 %in
@@ -15,12 +55,48 @@
   ret void
 }
-; GCN-LABEL: {{^}}select_trunc_i64:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+; SI-LABEL: select_trunc_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_lt_u32 s2, 6
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: select_trunc_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cmp_lt_u32 s4, 6
+; VI-NEXT: s_cselect_b32 s0, s0, 0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX90A-LABEL: select_trunc_i64:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX90A-NEXT: s_load_dword s5, s[0:1], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_lt_u32 s4, 6
+; GFX90A-NEXT: s_cselect_b32 s0, s5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
   %trunc = trunc i64 %sel to i32
@@ -28,12 +104,49 @@
   ret void
 }
-; GCN-LABEL: {{^}}select_trunc_i64_2:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+; SI-LABEL: select_trunc_i64_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s8, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_gt_u32 s8, 5
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: select_trunc_i64_2:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_cmp_gt_u32 s6, 5
+; VI-NEXT: s_cselect_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX90A-LABEL: select_trunc_i64_2:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x2c
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_gt_u32 s8, 5
+; GFX90A-NEXT: s_cselect_b32 s0, s4, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
@@ -41,12 +154,58 @@
   ret void
 }
-; GCN-LABEL: {{^}}v_select_trunc_i64_2:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+; SI-LABEL: v_select_trunc_i64_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s1, s[8:9], 0x0
+; SI-NEXT: s_load_dword s2, s[10:11], 0x0
+; SI-NEXT: s_cmp_gt_u32 s0, 5
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_select_trunc_i64_2:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s1, s[4:5], 0x0
+; VI-NEXT: s_load_dword s4, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cmp_gt_u32 s0, 5
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cselect_b32 s0, s1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX90A-LABEL: v_select_trunc_i64_2:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x2c
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX90A-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX90A-NEXT: s_cmp_gt_u32 s8, 5
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cselect_b32 s0, s0, s1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s0
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -56,11 +215,61 @@
   ret void
 }
-; GCN-LABEL: {{^}}v_select_i64_split_imm:
-; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
-; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
-; GCN: s_endpgm
 define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+; SI-LABEL: v_select_i64_split_imm:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_cmp_gt_u32 s2, 5
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT: s_cselect_b64 vcc, -1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: v_cndmask_b32_e32 v1, 63, v0, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_select_i64_split_imm:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_mov_b32 s5, 63
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_cmp_gt_u32 s6, 5
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX90A-LABEL: v_select_i64_split_imm:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_gt_u32 s6, 5
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT: s_mov_b32 s4, 0
+; GFX90A-NEXT: s_mov_b32 s5, 63
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX90A-NEXT: s_endpgm
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 ; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
@@ -9,9 +9,10 @@
 ; EG: CNDE_INT
 ; EG: CNDE_INT
 ; SI: v_cmp_eq_u64
+; SI: v_cndmask
+; SI: v_cndmask
 ; VI: s_cmp_eq_u64
-; GCN: v_cndmask
-; GCN: v_cndmask
+; VI: s_cselect_b64
 define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -16,10 +16,10 @@
 ; GCN-LABEL: {{^}}sint_to_fp_i1_f64:
 ; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
-; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+; VI-DAG: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, -1.0, 0
+; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
+; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
+; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
 ; VI: s_endpgm
 ; SI-DAG: s_cmp_eq_u32
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -76,13 +76,15 @@
 ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
-; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; VI-DAG: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, 1.0, 0
+; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
+; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
+; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
 ; SI-DAG: s_cmp_eq_u32
 ; SI-DAG: s_cselect_b64 vcc, -1, 0
 ; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0