diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9798,38 +9798,42 @@
 
   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
   SDValue Src = N->getOperand(0);
-  SDValue Srl = N->getOperand(0);
-  if (Srl.getOpcode() == ISD::ZERO_EXTEND)
-    Srl = Srl.getOperand(0);
+  SDValue Shift = N->getOperand(0);
+  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
+    Shift = Shift.getOperand(0);
 
-  // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
-  if (Srl.getOpcode() == ISD::SRL) {
+  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
+    // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
+    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
-    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
-
-    if (const ConstantSDNode *C =
-        dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
-      Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
-                               EVT(MVT::i32));
+    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+    if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
+      Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+                                 SDLoc(Shift.getOperand(0)), MVT::i32);
+
+      unsigned ShiftOffset = 8 * Offset;
+      if (Shift.getOpcode() == ISD::SHL)
+        ShiftOffset -= C->getZExtValue();
+      else
+        ShiftOffset += C->getZExtValue();
 
-      unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
-      if (SrcOffset < 32 && SrcOffset % 8 == 0) {
-        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
-                           MVT::f32, Srl);
+      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
+        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
+                           MVT::f32, Shift);
       }
     }
   }
 
-  APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
-
-  KnownBits Known;
-  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
-                                        !DCI.isBeforeLegalizeOps());
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
-    DCI.CommitTargetLoweringOpt(TLO);
-  }
+  APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+  if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI))
+    return SDValue(N, 0);
+
+  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
+  if (SDValue DemandedSrc =
+          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
+    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -199,23 +199,21 @@
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
 ; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v4
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_or_b32_e32 v3, v0, v4
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -230,29 +228,24 @@
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v2, v[2:3]
-; VI-NEXT:    flat_load_ubyte v3, v[4:5]
-; VI-NEXT:    flat_load_ubyte v4, v[6:7]
+; VI-NEXT:    flat_load_ubyte v8, v[2:3]
+; VI-NEXT:    flat_load_ubyte v2, v[4:5]
+; VI-NEXT:    flat_load_ubyte v3, v[6:7]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v8
 ; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; VI-NEXT:    v_or_b32_e32 v1, v1, v3
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -277,29 +270,29 @@
 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    s_movk_i32 s12, 0xff
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_movk_i32 s12, 0xff
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 9, v1
-; SI-NEXT:    v_and_b32_e32 v6, 0xff00, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v1
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v6
-; SI-NEXT:    v_and_b32_e32 v7, s12, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; SI-NEXT:    v_and_b32_e32 v7, 0xff00, v4
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
+; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v6, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT:    v_and_b32_e32 v1, s12, v4
+; SI-NEXT:    v_and_b32_e32 v0, s12, v4
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
+; SI-NEXT:    v_or_b32_e32 v0, v7, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
+; SI-NEXT:    v_and_b32_e32 v2, s12, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -365,33 +358,30 @@
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
 ; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v3
-; SI-NEXT:    v_or_b32_e32 v3, v5, v6
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
-; SI-NEXT:    v_or_b32_e32 v0, v2, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v7, v0
+; SI-NEXT:    v_or_b32_e32 v0, v8, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v6, v0, v2
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
 ; SI-NEXT:    buffer_store_dword v7, off, s[4:7], 0 offset:24
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -408,43 +398,38 @@
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v8, v[0:1]
-; VI-NEXT:    flat_load_ubyte v9, v[2:3]
-; VI-NEXT:    flat_load_ubyte v10, v[4:5]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-NEXT:    flat_load_ubyte v10, v[2:3]
+; VI-NEXT:    flat_load_ubyte v11, v[4:5]
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 6, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 5, v0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 5, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v8, vcc, 1, v0
+; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v4, v[6:7]
+; VI-NEXT:    flat_load_ubyte v5, v[8:9]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
+; VI-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v10
 ; VI-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v2
+; VI-NEXT:    v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v4
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v8
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v3, v4
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v2
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -658,23 +643,21 @@
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
 ; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v4
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT:    v_or_b32_e32 v3, v0, v4
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -689,28 +672,25 @@
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v6, v[0:1]
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
+; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v3, v[4:5]
+; VI-NEXT:    flat_load_ubyte v4, v[6:7]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    v_or_b32_e32 v1, v1, v6
+; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v1
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v4
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -704,40 +704,41 @@
 ; GCN-NEXT:    s_load_dword s3, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s5, 0xff000000
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
-; GCN-NEXT:    s_load_dword s7, s[0:1], 0xc
+; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s2, s2, s5
 ; GCN-NEXT:    s_and_b32 s3, s3, s4
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 24
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 24
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s7, s[0:1], 0xc
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v0
+; GCN-NEXT:    v_rcp_f32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s7, s7, s4
 ; GCN-NEXT:    s_and_b32 s6, s6, s5
-; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
-; GCN-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
 ; GCN-NEXT:    s_sub_u32 s8, 0, s2
-; GCN-NEXT:    s_subb_u32 s9, 0, s3
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    s_subb_u32 s9, 0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mul_hi_u32 v4, s8, v1
 ; GCN-NEXT:    v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT:    v_mul_lo_u32 v5, s9, v1
+; GCN-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s8, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v10, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
@@ -748,18 +749,18 @@
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v3
+; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
 ; GCN-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s8, v1
-; GCN-NEXT:    v_mul_lo_u32 v7, s9, v1
+; GCN-NEXT:    v_mul_hi_u32 v6, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s9, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v6, s8, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v11, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v1, v6
+; GCN-NEXT:    v_mul_lo_u32 v11, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v13, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v6
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
@@ -773,50 +774,50 @@
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_mov_b32_e32 v3, s6
 ; GCN-NEXT:    v_alignbit_b32 v3, s7, v3, 24
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_mul_hi_u32 v5, v3, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, v3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v3, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, 0, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, 0, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, 0, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0, v0
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, v1
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v0
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v1
 ; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v0
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v1
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v1
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -1, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
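
Note: the following is a minimal standalone sketch, not part of the patch and not LLVM API (the helper name foldedByteIndex is invented here). It models the byte-index arithmetic the comments in the SIISelLowering.cpp hunk describe: cvt_f32_ubyteN of (x srl c) reads byte N + c/8 of the unshifted value, and cvt_f32_ubyteN of (x shl c) reads byte N - c/8, provided the shift is a whole number of bytes and the resulting index stays inside the 32-bit word. The patch computes this quantity in ShiftOffset with unsigned arithmetic and rejects out-of-range cases via the `< 32` check; the sketch uses signed arithmetic to make the bounds explicit.

// Illustrative sketch only (not LLVM code); C++17.
#include <cstdio>
#include <optional>

// Returns the ubyte index to use on the unshifted value, or nullopt if the
// fold does not apply (non-byte shift amount or out-of-range byte index).
static std::optional<unsigned> foldedByteIndex(unsigned ByteN, bool IsShl,
                                               unsigned ShiftAmt) {
  int Bits = 8 * static_cast<int>(ByteN);
  Bits += IsShl ? -static_cast<int>(ShiftAmt) : static_cast<int>(ShiftAmt);
  if (Bits < 0 || Bits >= 32 || Bits % 8 != 0)
    return std::nullopt;
  return static_cast<unsigned>(Bits / 8);
}

int main() {
  // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
  if (auto Idx = foldedByteIndex(0, /*IsShl=*/false, 16))
    std::printf("ubyte0 of (x >> 16) -> ubyte%u of x\n", *Idx);
  // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
  if (auto Idx = foldedByteIndex(1, /*IsShl=*/true, 8))
    std::printf("ubyte1 of (x << 8) -> ubyte%u of x\n", *Idx);
  // Rejected: a 4-bit shift is not a whole number of bytes.
  if (!foldedByteIndex(0, /*IsShl=*/false, 4))
    std::printf("ubyte0 of (x >> 4) is not folded\n");
  return 0;
}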