Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -239,6 +239,11 @@
   /// subtarget without any kind of limitation.
   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
 
+  /// Return the maximum workitem ID value in the function, for the given (0, 1,
+  /// 2) dimension. Uses the "reqd_work_group_size" metadata when present, and
+  /// otherwise falls back to the maximum flat work group size.
+  unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
+
   /// Creates value range metadata on an workitemid.* inrinsic call or load.
   bool makeLIDRangeMetadata(Instruction *I) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -436,6 +436,25 @@
   return Requested;
 }
 
+/// Return the size requested for dimension \p Dim by the kernel's
+/// "reqd_work_group_size" metadata, or the maximum unsigned value if the
+/// metadata is absent, malformed, or \p Dim is not a valid dimension index.
+static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
+  auto Node = Kernel.getMetadata("reqd_work_group_size");
+  if (Node && Node->getNumOperands() == 3 && Dim < 3)
+    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
+  return std::numeric_limits<unsigned>::max();
+}
+
+unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
+                                           unsigned Dimension) const {
+  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
+  if (ReqdSize != std::numeric_limits<unsigned>::max())
+    return ReqdSize - 1;
+  // No usable metadata: bound the ID by the largest flat work group size.
+  return getFlatWorkGroupSizes(Kernel).second - 1;
+}
+
 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   Function *Kernel = I->getParent()->getParent();
   unsigned MinSize = 0;
@@ -472,11 +491,11 @@
       default:
         break;
       }
+
       if (Dim <= 3) {
-        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
-          if (Node->getNumOperands() == 3)
-            MinSize = MaxSize = mdconst::extract<ConstantInt>(
-                                  Node->getOperand(Dim))->getZExtValue();
+        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
+        if (ReqdSize != std::numeric_limits<unsigned>::max())
+          MinSize = MaxSize = ReqdSize;
       }
     }
   }
Index:
llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -420,6 +420,11 @@
   void computeKnownBitsForFrameIndex(int FrameIdx,
                                      KnownBits &Known,
                                      const MachineFunction &MF) const override;
+  void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R,
+                                      KnownBits &Known,
+                                      const APInt &DemandedElts,
+                                      const MachineRegisterInfo &MRI,
+                                      unsigned Depth = 0) const override;
 
   Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
                                         const MachineRegisterInfo &MRI,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11342,6 +11342,51 @@
   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
 }
 
+/// Set the high zero bits of \p Known for a workitem ID in dimension \p Dim,
+/// using the maximum ID the subtarget reports for the current function.
+static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
+                                   KnownBits &Known, unsigned Dim) {
+  unsigned MaxValue =
+      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
+  Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+}
+
+/// Compute known bits for target generic instructions. Only G_INTRINSIC
+/// calls to the workitem ID and mbcnt intrinsics are handled; \p Known is
+/// left unchanged for anything else.
+void SITargetLowering::computeKnownBitsForTargetInstr(
+    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
+    const MachineRegisterInfo &MRI, unsigned Depth) const {
+  const MachineInstr *MI = MRI.getVRegDef(R);
+  switch (MI->getOpcode()) {
+  case AMDGPU::G_INTRINSIC: {
+    switch (MI->getIntrinsicID()) {
+    case Intrinsic::amdgcn_workitem_id_x:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
+      break;
+    case Intrinsic::amdgcn_workitem_id_y:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
+      break;
+    case Intrinsic::amdgcn_workitem_id_z:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
+      break;
+    case Intrinsic::amdgcn_mbcnt_lo:
+    case Intrinsic::amdgcn_mbcnt_hi: {
+      // These return at most the wavefront size - 1.
+      // NOTE(review): mbcnt also adds its second operand to the lane count, so
+      // this bound presumably only holds when that operand is zero - confirm.
+      unsigned Size = MRI.getType(R).getSizeInBits();
+      Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+      break;
+    }
+    default:
+      break;
+    }
+    break;
+  }
+  }
+}
+
 Align SITargetLowering::computeKnownAlignForTargetInstr(
     GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
     unsigned Depth) const {
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -581,8 +581,8 @@
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
 ; SI-NEXT: s_mov_b32 s2, 0
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -612,13 +612,12 @@
 ; VI: ; %bb.0:
 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
@@ -688,8 +687,8 @@
 ; SI: ; %bb.0:
 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
 ; SI-NEXT: s_mov_b32 s2, 0
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ;
SI-NEXT: s_waitcnt lgkmcnt(0) @@ -707,13 +706,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 @@ -737,8 +735,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -755,13 +753,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0 @@ -830,8 +827,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 
31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -861,13 +858,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 @@ -904,8 +900,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -922,13 +918,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: 
v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 @@ -950,8 +945,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -969,13 +964,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 @@ -999,8 +993,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,13 +1012,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: 
v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1049,8 +1042,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1066,13 +1059,12 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 @@ -1094,8 +1086,8 @@ ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1114,13 +1106,12 @@ ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -365,19 +365,18 @@ ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -387,19 +386,18 @@ ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; 
VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -442,13 +440,12 @@ ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -458,13 +455,12 @@ ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -672,19 +668,18 @@ ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -694,19 +689,18 @@ ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 
2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -749,13 +743,12 @@ ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -765,13 +758,12 @@ ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: 
s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -991,19 +983,18 @@ ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1014,19 +1005,18 @@ ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: 
v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1071,13 +1061,12 @@ ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1088,13 +1077,12 @@ ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1556,19 +1544,18 @@ ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1579,19 +1566,18 @@ ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: 
v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc @@ -1636,13 +1622,12 @@ ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1653,13 +1638,12 @@ ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 
v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -365,19 +365,18 @@ ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -387,19 +386,18 @@ ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: 
v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -409,19 +407,18 @@ ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -440,13 +437,12 @@ ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -456,13 +452,12 @@ ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -472,13 +467,12 @@ ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 42 @@ -925,19 +919,18 @@ ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -948,19 +941,18 @@ ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; 
VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -971,19 +963,18 @@ ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, 40, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc @@ -1003,13 +994,12 @@ ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1020,13 +1010,12 @@ ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1037,13 +1026,12 @@ ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1133,19 +1121,18 @@ ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1155,19 +1142,18 @@ ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 
-; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1177,19 +1163,18 @@ ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 +; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1208,13 +1193,12 @@ ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, 42 @@ -1224,13 +1208,12 @@ ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, 42 @@ -1240,13 +1223,12 @@ ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 42 @@ -1406,19 +1388,18 @@ ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1429,19 +1410,18 @@ ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v4, 
vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1452,19 +1432,18 @@ ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, 
v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc @@ -1484,13 +1463,12 @@ ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1501,13 +1479,12 @@ ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1518,13 +1495,12 @@ ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 
v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -840,9 +840,9 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] @@ -866,13 +866,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1 @@ -898,20 +897,20 @@ ; ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W32: ; %bb.0: +; GFX10_W32-NEXT: s_clause 0x1 ; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi -; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s6 -; GFX10_W32-NEXT: v_mov_b32_e32 v4, s7 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s7 ; GFX10_W32-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1 +; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v1, v3 ; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo +; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10_W32-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W32-NEXT: v_add_co_u32_e64 v3, vcc_lo, v1, 8 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2 @@ -931,19 +930,19 @@ ; ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: ; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_clause 0x1 ; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54 -; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s6 -; GFX10_W64-NEXT: v_mov_b32_e32 v4, s7 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s7 ; GFX10_W64-NEXT: s_add_u32 s0, s4, 8 ; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0 ; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1 +; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v1, v3 ; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc +; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, 0, v2, vcc ; GFX10_W64-NEXT: s_and_b32 s2, 1, s2 ; GFX10_W64-NEXT: v_add_co_u32_e64 
v3, vcc, v1, 8 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 @@ -984,8 +983,8 @@ ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1017,14 +1016,13 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x4c -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc @@ -1053,16 +1051,15 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W32: ; %bb.0: ; %entry ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W32-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W32-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10_W32-NEXT: s_mov_b32 s4, 0 ; GFX10_W32-NEXT: ; implicit-def: $vcc_hi -; GFX10_W32-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10_W32-NEXT: v_mov_b32_e32 v3, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s2 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s3 ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v3, v1 -; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, 
vcc_lo, v4, v2, vcc_lo +; GFX10_W32-NEXT: v_add_co_u32_e64 v1, vcc_lo, v1, v3 +; GFX10_W32-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10_W32-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W32-NEXT: s_and_saveexec_b32 s5, vcc_lo @@ -1092,15 +1089,14 @@ ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX10_W64: ; %bb.0: ; %entry ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c -; GFX10_W64-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10_W64-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GFX10_W64-NEXT: s_mov_b32 s6, 0 -; GFX10_W64-NEXT: v_lshlrev_b64 v[1:2], 2, v[0:1] ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v4, s3 -; GFX10_W64-NEXT: v_mov_b32_e32 v3, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s3 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v3, v1 -; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc +; GFX10_W64-NEXT: v_add_co_u32_e64 v1, vcc, v1, v3 +; GFX10_W64-NEXT: v_add_co_ci_u32_e32 v2, vcc, 0, v2, vcc ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v[1:2], off ; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -8,8 +8,8 @@ ; GFX7-LABEL: test_div_scale_f32_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -26,13 +26,12 @@ ; GFX8-LABEL: test_div_scale_f32_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -47,14 +46,13 @@ ; GFX10-LABEL: test_div_scale_f32_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -83,8 +81,8 @@ ; GFX7-LABEL: test_div_scale_f32_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -101,13 +99,12 @@ ; GFX8-LABEL: test_div_scale_f32_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; 
GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -122,14 +119,13 @@ ; GFX10-LABEL: test_div_scale_f32_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -158,14 +154,13 @@ ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -180,14 +175,13 @@ ; GFX8-LABEL: test_div_scale_f64_1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -202,15 +196,14 @@ ; GFX10-LABEL: test_div_scale_f64_1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -239,14 +232,13 @@ ; GFX7-LABEL: 
test_div_scale_f64_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -261,14 +253,13 @@ ; GFX8-LABEL: test_div_scale_f64_2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -283,15 +274,14 @@ ; GFX10-LABEL: test_div_scale_f64_2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -321,8 +311,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -339,13 +329,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 @@ -356,16 +345,16 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, v0, s0 @@ -389,8 +378,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -407,13 +396,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 @@ -424,16 +412,16 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_num_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: 
$vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, v0, s0 @@ -457,8 +445,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -475,13 +463,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 @@ -492,16 +479,16 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, s0, v0 @@ -525,8 +512,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -543,13 +530,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 @@ -560,16 +546,16 @@ ; ; GFX10-LABEL: test_div_scale_f32_scalar_den_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, s0, v0 @@ -593,13 +579,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -612,13 +597,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -629,17 +613,17 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -662,13 +646,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -681,13 +664,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -698,17 +680,17 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_num_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -731,13 +713,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; 
GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -750,13 +731,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -767,17 +747,17 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_1: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: 
global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -800,13 +780,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -819,13 +798,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -836,17 +814,17 @@ ; ; GFX10-LABEL: test_div_scale_f64_scalar_den_2: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1056,8 +1034,8 @@ ; GFX7-LABEL: test_div_scale_f32_inline_imm_num: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1073,13 +1051,12 @@ ; GFX8-LABEL: test_div_scale_f32_inline_imm_num: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 @@ -1091,14 +1068,13 @@ ; GFX10-LABEL: test_div_scale_f32_inline_imm_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, 1.0 @@ -1120,8 +1096,8 @@ ; GFX7-LABEL: test_div_scale_f32_inline_imm_den: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1137,13 +1113,12 @@ ; GFX8-LABEL: test_div_scale_f32_inline_imm_den: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 @@ -1155,14 +1130,13 @@ ; GFX10-LABEL: test_div_scale_f32_inline_imm_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 
2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s2, 2.0, 2.0, v0 @@ -1184,8 +1158,8 @@ ; GFX7-LABEL: test_div_scale_f32_fabs_num: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1204,13 +1178,12 @@ ; GFX8-LABEL: test_div_scale_f32_fabs_num: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -1227,14 +1200,13 @@ ; GFX10-LABEL: test_div_scale_f32_fabs_num: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -1267,8 +1239,8 @@ ; GFX7-LABEL: test_div_scale_f32_fabs_den: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1286,13 +1258,12 @@ ; GFX8-LABEL: test_div_scale_f32_fabs_den: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] @@ -1308,14 +1279,13 @@ ; GFX10-LABEL: test_div_scale_f32_fabs_den: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -8,13 +8,12 @@ ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_load_dword s0, s[4:5], 0x11 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,13 +25,12 @@ ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -8,13 +8,12 @@ ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_load_dword s0, s[4:5], 0x10 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -26,13 +25,12 @@ ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: 
s_lshl_b32 s0, s0, 16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -39,13 +39,12 @@ ; GFX8-LABEL: update_dpp64_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 @@ -59,16 +58,15 @@ ; GFX10-LABEL: update_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -202,71 +202,68 @@ ; GFX7-LABEL: muli24_shl64: ; GFX7: ; %bb.0: ; %bb ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX7-NEXT: buffer_load_dword v7, v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshl_b64 v[3:4], v[0:1], 3 -; GFX7-NEXT: v_mov_b32_e32 v6, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v5, s0 +; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v7 +; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v0 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[1:2], 3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v5, v3 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v5 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: muli24_shl64: ; GFX8: ; %bb.0: ; %bb ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: 
flat_load_dword v7, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], 3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v6, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v7 -; GFX8-NEXT: v_mul_i32_i24_e32 v1, -7, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v4 +; GFX8-NEXT: v_mul_i32_i24_e32 v0, -7, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: muli24_shl64: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: global_load_dword v7, v[2:3], off -; GFX9-NEXT: v_lshlrev_b64 v[3:4], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: global_load_dword v4, v[1:2], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, 0xff800000, v7 -; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX9-NEXT: v_or_b32_e32 v0, 0xff800000, v4 +; GFX9-NEXT: v_mul_i32_i24_e32 v0, -7, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm bb: