diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1045,14 +1045,19 @@ // Try to simplify operations with a constant that may appear after instruction // selection. // TODO: See if a frame index with a fixed offset can fold. -static bool tryConstantFoldOp(MachineRegisterInfo &MRI, - const SIInstrInfo *TII, - MachineInstr *MI, - MachineOperand *ImmOp) { +static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII, + MachineInstr *MI) { unsigned Opc = MI->getOpcode(); - if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || - Opc == AMDGPU::S_NOT_B32) { - MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm()); + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); + + if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || + Opc == AMDGPU::S_NOT_B32) && + Src0->isImm()) { + MI->getOperand(1).ChangeToImmediate(~Src0->getImm()); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1060,9 +1065,6 @@ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx)); MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx)); if (!Src0->isImm() && !Src1->isImm()) @@ -1189,68 +1191,59 @@ SmallVector FoldList; MachineOperand &Dst = MI.getOperand(0); + if (OpToFold.isImm()) { + for (auto &UseMI : + make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) { + // Folding the immediate may reveal operations that can be constant + // folded or replaced with a copy. This can happen for example after + // frame indices are lowered to constants or from splitting 64-bit + // constants. + // + // We may also encounter cases where one or both operands are + // immediates materialized into a register, which would ordinarily not + // be folded due to multiple uses or operand constraints. + if (tryConstantFoldOp(*MRI, TII, &UseMI)) + LLVM_DEBUG(dbgs() << "Constant folded " << UseMI); + } + } + bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); if (FoldingImm) { unsigned NumLiteralUses = 0; MachineOperand *NonInlineUse = nullptr; int NonInlineUseOpNo = -1; - bool Again; - do { - Again = false; - for (auto &Use : make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Folding the immediate may reveal operations that can be constant - // folded or replaced with a copy. This can happen for example after - // frame indices are lowered to constants or from splitting 64-bit - // constants. - // - // We may also encounter cases where one or both operands are - // immediates materialized into a register, which would ordinarily not - // be folded due to multiple uses or operand constraints. - - if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI); - - // Some constant folding cases change the same immediate's use to a new - // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user - // again. 
The same constant folded instruction could also have a second - // use operand. - FoldList.clear(); - Again = true; - break; - } + for (auto &Use : + make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { + MachineInstr *UseMI = Use.getParent(); + unsigned OpNo = UseMI->getOperandNo(&Use); - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, - CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } + // Try to fold any inline immediate uses, and then only fold other + // constants if they have one use. + // + // The legality of the inline immediate must be checked based on the use + // operand, not the defining instruction, because 32-bit instructions + // with 32-bit inline immediate sources may be used to materialize + // constants used in 16-bit operands. + // + // e.g. it is unsafe to fold: + // s_mov_b32 s0, 1.0 // materializes 0x3f800000 + // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. 
+ if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); + } else { + if (++NumLiteralUses == 1) { + NonInlineUse = &Use; + NonInlineUseOpNo = OpNo; } } - } while (Again); + } if (NumLiteralUses == 1) { MachineInstr *UseMI = NonInlineUse->getParent(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3209,40 +3209,40 @@ ; GFX6-LABEL: v_fshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s5, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 17, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 17, v1 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 +; GFX6-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_bfe_u32 v7, v7, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 15, v5 -; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 15, v6 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v6 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s5, v4 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 @@ -3589,42 +3589,42 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v0 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: s_bfe_u32 s4, 14, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 17, v0 ; GFX6-NEXT: s_lshr_b32 s2, s0, 16 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_lshl_b32 s0, s2, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_lshl_b32 s0, s2, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 ; GFX6-NEXT: s_xor_b32 s0, s1, -1 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_and_b32 s2, s0, 15 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4134,7 +4134,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 @@ -4146,18 +4146,18 @@ ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v5 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v6 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v12 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v7 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v12 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 @@ -4169,7 +4169,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 ; 
GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 @@ -4180,7 +4180,7 @@ ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s5, v5 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 @@ -4202,21 +4202,21 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 14, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 -; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v9, v6 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX8-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v9 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -4225,23 +4225,24 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 14, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX8-NEXT: 
v_and_b32_e32 v6, 15, v6 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1
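
A note on the folds exercised above: once OpToFold is a plain immediate, the hoisted call to tryConstantFoldOp lets each user of the def collapse independently, e.g. "not imm" rewrites to a move of ~imm, and "and x, 0" collapses to 0 (the case named in the comment of the removed do-while loop). Below is a minimal standalone C++ sketch of those two 32-bit identities; it is an illustration only, does not use the LLVM MachineInstr API, and the helper names foldNot/foldAnd are invented for the example.

// Standalone illustration of the scalar identities behind tryConstantFoldOp.
// Not LLVM code; foldNot/foldAnd are hypothetical helpers for this sketch.
#include <cassert>
#include <cstdint>
#include <optional>

// not imm -> ~imm (mirrors the V_NOT_B32/S_NOT_B32 -> mov(~imm) rewrite).
std::optional<uint32_t> foldNot(std::optional<uint32_t> Src0) {
  if (!Src0)
    return std::nullopt; // src0 is not a known immediate; nothing to fold.
  return ~*Src0;
}

// An AND with a known zero is zero regardless of the other operand
// ("and x, 0 -> 0"); with both operands known, fold completely.
std::optional<uint32_t> foldAnd(std::optional<uint32_t> Src0,
                                std::optional<uint32_t> Src1) {
  if (Src0 && Src1)
    return *Src0 & *Src1;
  if ((Src0 && *Src0 == 0) || (Src1 && *Src1 == 0))
    return 0;
  return std::nullopt; // no constant result can be proven.
}

int main() {
  assert(foldNot(0u) == 0xffffffffu);           // not 0 -> all ones
  assert(foldAnd(std::nullopt, 0u) == 0u);      // and x, 0 -> 0
  assert(!foldAnd(std::nullopt, std::nullopt)); // nothing known, no fold
  return 0;
}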