diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1103,6 +1103,11 @@ Mutations.push_back(std::make_unique(&InstrInfo)); } +std::unique_ptr +GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { + return std::make_unique(&InstrInfo); +} + const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) return static_cast(MF.getSubtarget()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -809,6 +809,7 @@ // allow calls without EnableAMDGPUFunctionCalls if they are marked // noinline, so this is always required. setRequiresCodeGenSCCOrder(true); + substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } GCNTargetMachine &getGCNTargetMachine() const { @@ -818,6 +819,14 @@ ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override; + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + const GCNSubtarget &ST = C->MF->getSubtarget(); + DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); + return DAG; + } + bool addPreISel() override; void addMachineSSAOptimization() override; bool addILPOpts() override; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1130,6 +1130,9 @@ std::vector> &Mutations) const override; + std::unique_ptr + createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; + bool isWave32() const { return getWavefrontSize() == 32; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1048,6 +1048,10 @@ ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; + ScheduleHazardRecognizer * + CreateTargetMIHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAGMI *DAG) const override; + bool isBasicBlockPrologue(const MachineInstr &MI) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/DiagnosticInfo.h" @@ -7462,6 +7463,20 @@ return new GCNHazardRecognizer(MF); } +// Called during: +// - pre-RA scheduling and post-RA scheduling +ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAGMI *DAG) const { + // Borrowed from Arm Target + // We would like to restrict this hazard recognizer to only + // post-RA scheduling; we can tell that we're post-RA because we don't + // track VRegLiveness. + if (!DAG->hasVRegLiveness()) + return new GCNHazardRecognizer(DAG->MF); + return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); +} + std::pair SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -210,8 +210,8 @@ ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_mov_b32 s1, 0xffc0 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s1 @@ -312,8 +312,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 @@ -351,8 +351,8 @@ ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 @@ -393,8 +393,8 @@ ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 @@ -423,8 +423,8 @@ ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s2, 0x80008000 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 +; GFX9-NEXT: s_xor_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -435,12 +435,12 @@ ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s2, 0x80008000 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_xor_b32 s0, s0, s2 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_add_i32 s2, s2, s4 @@ -452,10 +452,10 @@ ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s2, 0x80008000 -; GFX10-NEXT: s_xor_b32 s1, s1, s2 ; GFX10-NEXT: s_xor_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: s_add_i32 s2, s2, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -430,8 +430,8 @@ ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -459,8 +459,8 @@ ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -488,8 +488,8 @@ ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -526,8 +526,8 @@ ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s3, s4, s1 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_and_b32 s3, s4, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_and_b32 s1, s6, s1 @@ -633,11 +633,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -676,11 +676,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -719,11 +719,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -773,8 +773,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 ; GFX6-NEXT: s_and_b32 s2, s4, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s3, s6, s14 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, s14 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, s14 @@ -831,8 +831,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -810,11 +810,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s3, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: s_bfe_i32 s0, s0, s3 +; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: s_bfe_i32 s1, s1, s3 -; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: s_ashr_i32 s2, s2, s4 +; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -823,8 +823,8 @@ ; GFX9-LABEL: s_ashr_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s2, s0 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 ; GFX9-NEXT: s_ashr_i32 s1, s1, 16 ; GFX9-NEXT: s_ashr_i32 s2, s2, s3 ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 @@ -834,8 +834,8 @@ ; GFX10-LABEL: s_ashr_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 ; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_sext_i32_i16 s3, s1 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 ; GFX10-NEXT: s_ashr_i32 s2, s2, s3 ; GFX10-NEXT: s_ashr_i32 s0, s0, s1 @@ -948,10 +948,10 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1004,10 +1004,10 @@ ; GFX6-NEXT: s_ashr_i32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s6, s8 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_ashr_i32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 +; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_ashr_i32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -1022,18 +1022,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s5, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_bfe_i32 s0, s0, s5 -; GFX8-NEXT: s_bfe_i32 s2, s2, s5 +; GFX8-NEXT: s_sext_i32_i16 s6, s1 ; GFX8-NEXT: s_bfe_i32 s1, s1, s5 +; GFX8-NEXT: s_sext_i32_i16 s7, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, s5 +; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_bfe_i32 s3, s3, s5 +; GFX8-NEXT: s_ashr_i32 s4, s4, s7 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s2, s6, s8 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_ashr_i32 s4, s4, s7 ; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: s_ashr_i32 s2, s6, s8 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s4, s4, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -1045,15 +1045,15 @@ ; GFX9-LABEL: s_ashr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s4, s4, s5 +; GFX9-NEXT: s_ashr_i32 s0, s0, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s0 ; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 ; GFX9-NEXT: s_ashr_i32 s2, s2, s4 ; GFX9-NEXT: s_ashr_i32 s1, s1, s3 @@ -1063,14 +1063,14 @@ ; GFX10-LABEL: s_ashr_v4i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sext_i32_i16 s4, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-NEXT: s_ashr_i32 s4, s4, s5 ; GFX10-NEXT: s_ashr_i32 s0, s0, s2 ; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 ; GFX10-NEXT: s_ashr_i32 s2, s2, s5 ; GFX10-NEXT: s_ashr_i32 s1, s1, s3 @@ -1125,28 +1125,28 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v8, v3 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, v8, v6 +; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 +; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 ; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 -; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, v8, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -1211,28 +1211,28 @@ ; GFX6-NEXT: s_ashr_i32 s3, s3, s8 ; GFX6-NEXT: s_and_b32 s8, s12, s16 ; GFX6-NEXT: s_sext_i32_i16 s4, s4 -; GFX6-NEXT: s_and_b32 s1, s1, s16 ; GFX6-NEXT: s_ashr_i32 s4, s4, s8 ; GFX6-NEXT: s_and_b32 s8, s13, s16 ; GFX6-NEXT: s_sext_i32_i16 s5, s5 +; GFX6-NEXT: s_and_b32 s1, s1, s16 ; GFX6-NEXT: s_ashr_i32 s5, s5, s8 ; GFX6-NEXT: s_and_b32 s8, s14, s16 ; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_and_b32 s0, s0, s16 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s6, s6, s8 +; GFX6-NEXT: s_and_b32 s8, s15, s16 +; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s16 ; GFX6-NEXT: s_and_b32 s2, s3, s16 -; GFX6-NEXT: s_and_b32 s8, s15, s16 -; GFX6-NEXT: s_sext_i32_i16 s7, s7 -; GFX6-NEXT: s_and_b32 s3, s5, s16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, s16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 @@ -1243,38 +1243,38 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s9, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_sext_i32_i16 s13, s4 +; GFX8-NEXT: s_bfe_i32 s0, s0, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, s9 ; GFX8-NEXT: s_sext_i32_i16 s12, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s5 -; GFX8-NEXT: s_sext_i32_i16 s16, s7 -; GFX8-NEXT: s_bfe_i32 s0, s0, s9 +; GFX8-NEXT: s_bfe_i32 s3, s3, s9 +; GFX8-NEXT: s_sext_i32_i16 s13, s4 ; GFX8-NEXT: s_bfe_i32 s4, s4, s9 -; GFX8-NEXT: s_bfe_i32 s1, s1, s9 +; GFX8-NEXT: s_sext_i32_i16 s14, s5 ; GFX8-NEXT: s_bfe_i32 s5, s5, s9 -; GFX8-NEXT: s_bfe_i32 s3, s3, s9 +; GFX8-NEXT: s_sext_i32_i16 s16, s7 ; GFX8-NEXT: s_bfe_i32 s7, s7, s9 -; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_ashr_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s11, s2 -; GFX8-NEXT: s_sext_i32_i16 s15, s6 ; GFX8-NEXT: s_bfe_i32 s2, s2, s9 +; GFX8-NEXT: s_sext_i32_i16 s15, s6 ; GFX8-NEXT: s_bfe_i32 s6, s6, s9 +; GFX8-NEXT: s_ashr_i32 s0, s0, s4 ; GFX8-NEXT: s_ashr_i32 s4, s10, s14 +; GFX8-NEXT: s_ashr_i32 s1, s1, s5 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 ; GFX8-NEXT: s_mov_b32 s7, 0xffff -; GFX8-NEXT: s_ashr_i32 s2, s2, s6 ; GFX8-NEXT: s_ashr_i32 s5, s11, s15 +; GFX8-NEXT: s_ashr_i32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s4, s4, s7 ; GFX8-NEXT: s_ashr_i32 s8, s8, s13 -; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_ashr_i32 s6, s12, s16 +; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_and_b32 s4, s5, s7 -; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s8, s8, s7 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_and_b32 s4, s6, s7 ; GFX8-NEXT: s_or_b32 s0, s0, s8 @@ -1284,29 +1284,29 @@ ; GFX9-LABEL: s_ashr_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_sext_i32_i16 s8, s0 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s9, s4 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s0, s0, s4 ; GFX9-NEXT: s_ashr_i32 s8, s8, s9 +; GFX9-NEXT: s_ashr_i32 s0, s0, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s0 ; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_sext_i32_i16 s8, s5 ; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, s5 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, s5 ; GFX9-NEXT: s_ashr_i32 s4, s4, s8 +; GFX9-NEXT: s_ashr_i32 s1, s1, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 ; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s6 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: s_sext_i32_i16 s5, s6 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s2 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s7 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: s_sext_i32_i16 s5, s7 ; GFX9-NEXT: s_ashr_i32 s6, s7, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, s5 ; GFX9-NEXT: s_ashr_i32 s3, s3, s6 @@ -1316,14 +1316,14 @@ ; GFX10-LABEL: s_ashr_v8i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_sext_i32_i16 s8, s0 -; GFX10-NEXT: s_sext_i32_i16 s9, s4 ; GFX10-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-NEXT: s_sext_i32_i16 s9, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 ; GFX10-NEXT: s_ashr_i32 s8, s8, s9 ; GFX10-NEXT: s_ashr_i32 s0, s0, s4 ; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_sext_i32_i16 s9, s5 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_sext_i32_i16 s9, s5 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 ; GFX10-NEXT: s_ashr_i32 s4, s4, s9 ; GFX10-NEXT: s_ashr_i32 s1, s1, s5 @@ -1335,8 +1335,8 @@ ; GFX10-NEXT: s_ashr_i32 s4, s4, s5 ; GFX10-NEXT: s_ashr_i32 s2, s2, s6 ; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_sext_i32_i16 s6, s7 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_sext_i32_i16 s6, s7 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 ; GFX10-NEXT: s_ashr_i32 s5, s5, s6 ; GFX10-NEXT: s_ashr_i32 s3, s3, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -92,8 +92,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 s0, 0x10203 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -103,8 +103,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_mov_b32 s0, 0x10203 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -285,9 +285,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 ; GFX8-NEXT: v_perm_b32 v2, 0, v2, s1 ; GFX8-NEXT: v_perm_b32 v3, 0, v3, s1 -; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -302,9 +302,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 ; GFX9-NEXT: v_perm_b32 v2, 0, v2, s1 ; GFX9-NEXT: v_perm_b32 v3, 0, v3, s1 -; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -181,8 +181,8 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8 ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -205,12 +205,12 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 -; SI-NEXT: v_bfe_u32 v2, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; SI-NEXT: v_bfe_u32 v2, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -233,12 +233,12 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 -; SI-NEXT: v_bfe_u32 v2, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1 ; SI-NEXT: v_bfe_u32 v1, v0, 8, 8 -; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; SI-NEXT: v_bfe_u32 v2, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -41,10 +41,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v18, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, v3 ; GFX9-NEXT: s_set_gpr_idx_off ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0) +; GFX9-NEXT: v_mov_b32_e32 v3, v3 +; GFX9-NEXT: s_set_gpr_idx_off ; GFX9-NEXT: v_readfirstlane_b32 s2, v18 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog @@ -128,8 +130,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -187,8 +189,8 @@ ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 ; GFX8-NEXT: s_waitcnt vmcnt(1) @@ -251,8 +253,8 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16 ; GFX7-NEXT: v_add_i32_e32 v17, vcc, 1, v16 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] @@ -310,8 +312,8 @@ ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v19, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_cndmask_b32_e32 v15, v3, v5, vcc_lo @@ -324,15 +326,15 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v8, vcc_lo ; GFX10-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v19 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v11, vcc_lo @@ -350,8 +352,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc_lo @@ -408,13 +410,13 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 ; GFX9-NEXT: v_mov_b32_e32 v15, s14 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_mov_b32_e32 v16, s15 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 @@ -442,8 +444,8 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s4 ; GFX8-NEXT: v_mov_b32_e32 v6, s5 @@ -478,13 +480,13 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 ; GFX8-NEXT: v_mov_b32_e32 v15, s14 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_mov_b32_e32 v16, s15 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 @@ -512,8 +514,8 @@ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s3 ; GFX7-NEXT: v_mov_b32_e32 v5, s4 ; GFX7-NEXT: v_mov_b32_e32 v6, s5 @@ -548,13 +550,13 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 ; GFX7-NEXT: v_mov_b32_e32 v15, s14 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX7-NEXT: v_mov_b32_e32 v16, s15 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19 @@ -621,12 +623,12 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -38,8 +38,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX9-NEXT: s_lshl_b32 s0, s1, 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -51,8 +51,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s0, s1, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -64,8 +64,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_lshl_b32 s0, s1, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -96,8 +96,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 @@ -109,8 +109,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 @@ -122,8 +122,8 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 @@ -153,8 +153,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 @@ -605,11 +605,11 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -12,8 +12,8 @@ ; GCN-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s1, s0, 24 ; GCN-NEXT: s_and_b32 s2, s0, 0xff -; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GCN-NEXT: s_lshl_b32 s3, s3, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s2, s0 @@ -52,8 +52,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_and_b32 s0, s2, 3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -137,8 +137,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v3, 8 -; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -226,8 +226,8 @@ ; GFX9-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s1, s0, 24 ; GFX9-NEXT: s_and_b32 s2, s0, 0xff -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s2, s0 @@ -246,8 +246,8 @@ ; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s1, s0, 24 ; GFX8-NEXT: s_and_b32 s2, s0, 0xff -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s2, s0 @@ -266,8 +266,8 @@ ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, 0xff -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s2, s0 @@ -284,14 +284,14 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -323,14 +323,14 @@ ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -361,14 +361,14 @@ ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog @@ -400,14 +400,14 @@ ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog @@ -439,14 +439,14 @@ ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog @@ -461,8 +461,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -534,8 +534,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -688,8 +688,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -780,8 +780,8 @@ ; GCN-NEXT: s_and_b32 s2, s1, s5 ; GCN-NEXT: s_bfe_u32 s5, s1, s7 ; GCN-NEXT: s_lshr_b32 s3, s1, 24 -; GCN-NEXT: s_bfe_u32 s1, s1, s8 ; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, s8 ; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 @@ -814,12 +814,12 @@ ; GFX10-NEXT: s_lshl_b32 s5, s10, 8 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s5, s9, s5 ; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_or_b32 s1, s2, s1 ; GFX10-NEXT: s_or_b32 s0, s0, s7 ; GFX10-NEXT: s_or_b32 s1, s1, s8 @@ -876,14 +876,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -913,13 +913,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -938,8 +938,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3 @@ -947,8 +947,8 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_and_b32 s0, s2, 3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 @@ -1003,14 +1003,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 @@ -1041,13 +1041,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 @@ -1073,8 +1073,8 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v7, v3 @@ -1096,7 +1096,7 @@ ; GCN-NEXT: s_mov_b32 s6, 0x80008 ; GCN-NEXT: s_movk_i32 s4, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s7, s0, s6 ; GCN-NEXT: s_and_b32 s5, s0, s4 @@ -1112,8 +1112,8 @@ ; GCN-NEXT: s_and_b32 s2, s1, s4 ; GCN-NEXT: s_bfe_u32 s4, s1, s6 ; GCN-NEXT: s_lshr_b32 s3, s1, 24 -; GCN-NEXT: s_bfe_u32 s1, s1, s7 ; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, s7 ; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 @@ -1121,7 +1121,7 @@ ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 @@ -1141,8 +1141,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s8, s0, s3 ; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_and_b32 s7, s0, s2 ; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s1, s2 ; GFX10-NEXT: s_bfe_u32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 @@ -1191,14 +1191,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -1229,14 +1229,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog @@ -1268,14 +1268,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog @@ -1307,14 +1307,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_and_b32 s1, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 24 +; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog @@ -1345,14 +1345,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s1, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_and_b32 s0, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -1383,14 +1383,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s1, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_and_b32 s0, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog @@ -1422,14 +1422,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s1, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_and_b32 s0, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog @@ -1461,14 +1461,14 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s2, s1, 0x80008 -; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_and_b32 s0, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog @@ -1483,9 +1483,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -1556,9 +1556,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -1710,9 +1710,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -1787,11 +1787,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 @@ -1860,11 +1860,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 @@ -1940,8 +1940,8 @@ ; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 @@ -2014,11 +2014,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 @@ -2101,33 +2101,33 @@ ; GCN-NEXT: s_bfe_u32 s0, s0, s12 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s10, s0 -; GCN-NEXT: s_bfe_u32 s10, s1, s11 ; GCN-NEXT: s_lshl_b32 s5, s5, 24 -; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_bfe_u32 s10, s1, s11 ; GCN-NEXT: s_lshr_b32 s6, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s5, s1, s9 -; GCN-NEXT: s_bfe_u32 s1, s1, s12 ; GCN-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, s12 ; GCN-NEXT: s_or_b32 s5, s5, s10 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s5, s1 ; GCN-NEXT: s_lshl_b32 s5, s6, 24 ; GCN-NEXT: s_bfe_u32 s6, s2, s11 -; GCN-NEXT: s_or_b32 s1, s1, s5 ; GCN-NEXT: s_lshr_b32 s7, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s5 ; GCN-NEXT: s_and_b32 s5, s2, s9 -; GCN-NEXT: s_bfe_u32 s2, s2, s12 ; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_bfe_u32 s2, s2, s12 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_bfe_u32 s6, s3, s11 ; GCN-NEXT: s_or_b32 s2, s5, s2 ; GCN-NEXT: s_lshl_b32 s5, s7, 24 -; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_bfe_u32 s6, s3, s11 ; GCN-NEXT: s_lshr_b32 s8, s3, 24 +; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_and_b32 s5, s3, s9 -; GCN-NEXT: s_bfe_u32 s3, s3, s12 ; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_bfe_u32 s3, s3, s12 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -2159,36 +2159,36 @@ ; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: s_bfe_u32 s15, s1, s6 ; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_or_b32 s12, s12, s13 +; GFX10-NEXT: s_bfe_u32 s6, s3, s6 ; GFX10-NEXT: s_lshr_b32 s9, s1, 24 +; GFX10-NEXT: s_lshr_b32 s10, s2, 24 +; GFX10-NEXT: s_lshr_b32 s11, s3, 24 ; GFX10-NEXT: s_and_b32 s14, s1, s5 ; GFX10-NEXT: s_bfe_u32 s1, s1, s7 ; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 +; GFX10-NEXT: s_lshl_b32 s8, s8, 24 ; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 +; GFX10-NEXT: s_lshl_b32 s17, s17, 8 +; GFX10-NEXT: s_or_b32 s0, s12, s0 +; GFX10-NEXT: s_bfe_u32 s2, s2, s7 ; GFX10-NEXT: s_and_b32 s5, s3, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 ; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_lshl_b32 s17, s17, 8 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s5, s5, s6 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_bfe_u32 s3, s3, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s13, s14, s15 +; GFX10-NEXT: s_or_b32 s0, s0, s8 ; GFX10-NEXT: s_or_b32 s8, s16, s17 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s3, s5, s3 +; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_or_b32 s1, s13, s1 ; GFX10-NEXT: s_or_b32 s2, s8, s2 ; GFX10-NEXT: s_lshl_b32 s8, s10, 24 +; GFX10-NEXT: s_or_b32 s3, s5, s3 ; GFX10-NEXT: s_lshl_b32 s5, s11, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 ; GFX10-NEXT: s_lshr_b32 s6, s4, 2 ; GFX10-NEXT: s_or_b32 s1, s1, s9 ; GFX10-NEXT: s_or_b32 s2, s2, s8 @@ -2214,11 +2214,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2234,8 +2234,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -2244,11 +2244,11 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v3, v3, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_or3_b32 v2, v2, v16, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_or3_b32 v3, v3, v6, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_lshl_b32 s0, s2, 3 @@ -2270,35 +2270,35 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 @@ -2321,45 +2321,45 @@ ; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: s_lshl_b32 s0, s2, 3 @@ -2374,8 +2374,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v6, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-NEXT: v_mov_b32_e32 v6, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -2398,8 +2398,8 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 +; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -2425,11 +2425,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_mov_b32 s5, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2438,26 +2438,26 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v12 ; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 ; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 +; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8 @@ -2481,34 +2481,34 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v5, v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v13 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -2532,45 +2532,45 @@ ; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v10, s4, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v12, s4, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 ; GFX7-NEXT: v_and_b32_e32 v14, v5, v0 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 ; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 ; GFX7-NEXT: v_and_b32_e32 v0, v6, v0 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 @@ -2586,9 +2586,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 ; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 -; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff +; GFX10-NEXT: v_mov_b32_e32 v7, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2604,15 +2604,15 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX10-NEXT: v_or3_b32 v3, v3, v14, v9 ; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12 ; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo @@ -2647,33 +2647,33 @@ ; GCN-NEXT: s_bfe_u32 s0, s0, s11 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 ; GCN-NEXT: s_or_b32 s0, s9, s0 -; GCN-NEXT: s_bfe_u32 s9, s1, s10 ; GCN-NEXT: s_lshl_b32 s4, s4, 24 -; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_bfe_u32 s9, s1, s10 ; GCN-NEXT: s_lshr_b32 s5, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s4 ; GCN-NEXT: s_and_b32 s4, s1, s8 -; GCN-NEXT: s_bfe_u32 s1, s1, s11 ; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, s11 ; GCN-NEXT: s_or_b32 s4, s4, s9 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s4, s1 ; GCN-NEXT: s_lshl_b32 s4, s5, 24 ; GCN-NEXT: s_bfe_u32 s5, s2, s10 -; GCN-NEXT: s_or_b32 s1, s1, s4 ; GCN-NEXT: s_lshr_b32 s6, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s4 ; GCN-NEXT: s_and_b32 s4, s2, s8 -; GCN-NEXT: s_bfe_u32 s2, s2, s11 ; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s2, s2, s11 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: s_bfe_u32 s5, s3, s10 ; GCN-NEXT: s_or_b32 s2, s4, s2 ; GCN-NEXT: s_lshl_b32 s4, s6, 24 -; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_bfe_u32 s5, s3, s10 ; GCN-NEXT: s_lshr_b32 s7, s3, 24 +; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_and_b32 s4, s3, s8 -; GCN-NEXT: s_bfe_u32 s3, s3, s11 ; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s3, s3, s11 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s4, s3 @@ -2681,13 +2681,13 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: s_or_b32 s3, s3, s4 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 @@ -2708,13 +2708,13 @@ ; GFX10-NEXT: s_bfe_u32 s12, s0, s5 ; GFX10-NEXT: s_bfe_u32 s14, s1, s5 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s11, s0, s4 ; GFX10-NEXT: s_and_b32 s13, s1, s4 ; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_and_b32 s11, s0, s4 ; GFX10-NEXT: s_lshl_b32 s12, s12, 8 ; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_or_b32 s11, s11, s12 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s11, s11, s12 ; GFX10-NEXT: s_or_b32 s12, s13, s14 ; GFX10-NEXT: s_lshl_b32 s8, s8, 24 ; GFX10-NEXT: s_or_b32 s1, s12, s1 @@ -2726,10 +2726,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshl_b32 s7, s7, 24 ; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_and_b32 s15, s2, s4 ; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_bfe_u32 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s15, s2, s4 ; GFX10-NEXT: s_lshl_b32 s16, s16, 8 +; GFX10-NEXT: s_bfe_u32 s2, s2, s6 ; GFX10-NEXT: s_or_b32 s0, s0, s7 ; GFX10-NEXT: s_or_b32 s7, s15, s16 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 @@ -2738,9 +2738,9 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 ; GFX10-NEXT: s_or_b32 s2, s7, s2 ; GFX10-NEXT: s_lshl_b32 s7, s9, 24 -; GFX10-NEXT: s_bfe_u32 s1, s3, s6 ; GFX10-NEXT: s_and_b32 s4, s3, s4 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_bfe_u32 s1, s3, s6 ; GFX10-NEXT: s_or_b32 s2, s2, s7 ; GFX10-NEXT: s_lshr_b32 s10, s3, 24 ; GFX10-NEXT: s_or_b32 s3, s4, s5 @@ -2766,8 +2766,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2839,8 +2839,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2993,8 +2993,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -3070,10 +3070,10 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 @@ -3143,10 +3143,10 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 @@ -3222,8 +3222,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 8 ; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 @@ -3297,10 +3297,10 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 @@ -3374,12 +3374,12 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3447,10 +3447,10 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v3 @@ -3526,10 +3526,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3601,12 +3601,12 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 @@ -3678,12 +3678,12 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3751,8 +3751,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -3830,10 +3830,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s4, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX9-NEXT: v_or3_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -3905,12 +3905,12 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v3, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v4 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -8,22 +8,22 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, 2.0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, 2.0, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0x40e00000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, 0x41000000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -105,23 +105,23 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s3, s5 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v7, s8 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_mov_b32_e32 v8, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc ; GCN-NEXT: ; return to shader part epilog @@ -291,21 +291,21 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[4:5], 1 ; GCN-NEXT: s_mov_b64 s[6:7], 2 -; GCN-NEXT: s_mov_b64 s[8:9], 3 ; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s7 -; GCN-NEXT: s_mov_b64 s[10:11], 4 +; GCN-NEXT: s_mov_b64 s[8:9], 3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s8 ; GCN-NEXT: v_mov_b32_e32 v6, s9 +; GCN-NEXT: s_mov_b64 s[10:11], 4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: s_mov_b64 s[12:13], 5 ; GCN-NEXT: v_mov_b32_e32 v7, s10 ; GCN-NEXT: v_mov_b32_e32 v8, s11 +; GCN-NEXT: s_mov_b64 s[12:13], 5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 @@ -446,8 +446,8 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v1, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 @@ -500,8 +500,8 @@ ; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: v_mov_b32_e32 v2, s1 ; MOVREL-NEXT: v_mov_b32_e32 v3, s2 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; MOVREL-NEXT: v_mov_b32_e32 v4, s3 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: v_mov_b32_e32 v5, s4 @@ -545,11 +545,11 @@ ; ; GFX10-LABEL: dyn_extract_v8i64_s_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s19, s5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: s_mov_b32 s19, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -1936,17 +1936,17 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc ; GCN-NEXT: ; return to shader part epilog @@ -2089,20 +2089,20 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v7, s8 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc ; GCN-NEXT: ; return to shader part epilog @@ -2263,8 +2263,8 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s6, s8 ; GCN-NEXT: s_mov_b32 s7, s9 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -2295,11 +2295,11 @@ ; ; GFX10-LABEL: dyn_extract_v6f64_s_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s15, s5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s15 +; GFX10-NEXT: s_mov_b32 s15, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s15 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -2465,8 +2465,8 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s6, s8 ; GCN-NEXT: s_mov_b32 s7, s9 ; GCN-NEXT: v_mov_b32_e32 v5, s4 @@ -2504,11 +2504,11 @@ ; ; GFX10-LABEL: dyn_extract_v7f64_s_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s19, s5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s19 +; GFX10-NEXT: s_mov_b32 s19, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s19 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -2963,43 +2963,43 @@ ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, vcc ; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000 +; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0x40e00000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, 0x41000000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v6, 0x41100000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 8, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v7, 0x41200000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 9, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_mov_b32_e32 v8, 0x41300000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 10, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GCN-NEXT: v_mov_b32_e32 v9, 0x41400000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 11, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GCN-NEXT: v_mov_b32_e32 v10, 0x41500000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 12, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GCN-NEXT: v_mov_b32_e32 v11, 0x41600000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 13, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_mov_b32_e32 v12, 0x41700000 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v12, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -3101,51 +3101,51 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b32 s3, s5 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 ; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 ; GCN-NEXT: s_mov_b32 s5, s7 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v5, s4 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 ; GCN-NEXT: s_mov_b32 s6, s8 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 ; GCN-NEXT: s_mov_b32 s7, s9 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v7, s6 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 ; GCN-NEXT: s_mov_b32 s8, s10 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_mov_b32_e32 v8, s7 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GCN-NEXT: s_mov_b32 s9, s11 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GCN-NEXT: v_mov_b32_e32 v9, s8 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 8, v0 ; GCN-NEXT: s_mov_b32 s10, s12 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GCN-NEXT: v_mov_b32_e32 v10, s9 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 9, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GCN-NEXT: v_mov_b32_e32 v11, s10 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 10, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_mov_b32_e32 v12, s13 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 11, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GCN-NEXT: v_mov_b32_e32 v13, s14 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 12, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT: v_mov_b32_e32 v14, s15 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 13, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc ; GCN-NEXT: v_mov_b32_e32 v15, s16 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc ; GCN-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -632,8 +632,8 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v5, v6, 1.0 @@ -657,16 +657,16 @@ ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 ; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 ; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] @@ -689,11 +689,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v7, v8, v6, v7 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v5 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 -; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v5, v7, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v7, v7 @@ -718,8 +718,8 @@ ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 ; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 ; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 @@ -850,16 +850,16 @@ ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 ; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 ; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] @@ -880,8 +880,8 @@ ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 ; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 ; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 @@ -964,8 +964,8 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -989,18 +989,18 @@ ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 ; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 @@ -1021,11 +1021,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 @@ -1050,8 +1050,8 @@ ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 ; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 @@ -1146,8 +1146,8 @@ ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -1171,18 +1171,18 @@ ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 ; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 ; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 ; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] ; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 @@ -1203,11 +1203,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 @@ -1232,8 +1232,8 @@ ; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 ; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 ; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8 @@ -1441,16 +1441,16 @@ ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 ; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 ; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 ; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 ; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 ; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] @@ -1471,8 +1471,8 @@ ; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 ; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 ; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -18,8 +18,8 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -121,8 +121,8 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -192,12 +192,12 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -265,12 +265,12 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -369,12 +369,12 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] ; GFX6-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX6-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -474,8 +474,8 @@ ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -545,18 +545,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] ; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] @@ -646,8 +646,8 @@ ; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] @@ -715,18 +715,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] ; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] @@ -816,8 +816,8 @@ ; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] @@ -838,8 +838,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v11 ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -878,8 +878,8 @@ ; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -906,8 +906,8 @@ ; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -939,8 +939,8 @@ ; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] @@ -961,8 +961,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v11 ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1001,8 +1001,8 @@ ; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -1029,8 +1029,8 @@ ; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -1062,8 +1062,8 @@ ; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] @@ -1131,8 +1131,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v11 ; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1171,8 +1171,8 @@ ; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -1199,8 +1199,8 @@ ; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] @@ -1232,8 +1232,8 @@ ; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] ; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] ; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] @@ -1301,18 +1301,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] ; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 +; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 ; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] ; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] ; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] @@ -1402,8 +1402,8 @@ ; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 ; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 ; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] ; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] ; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -12,8 +12,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s0, 4 @@ -58,12 +58,12 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -109,11 +109,11 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -183,9 +183,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_addk_i32 s0, 0x104 @@ -239,11 +239,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -297,11 +297,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -355,9 +355,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_addk_i32 s1, 0x4004 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_addk_i32 s0, 0x4004 @@ -411,11 +411,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -469,11 +469,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: scratch_store_dword v1, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 @@ -521,9 +521,9 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 @@ -566,8 +566,8 @@ ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: scratch_store_dword off, v0, s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll @@ -7,8 +7,8 @@ ; GFX6-LABEL: v_floor_f64_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -30,8 +30,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -50,8 +50,8 @@ ; GFX6-LABEL: v_floor_f64_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -74,8 +74,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -97,8 +97,8 @@ ; GFX6-LABEL: v_floor_f64_nonieee_nnan: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] @@ -117,8 +117,8 @@ ; GFX6-LABEL: v_floor_f64_non_ieee_fneg: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e64 v[2:3], -v[0:1] +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -141,8 +141,8 @@ ; GFX6-LABEL: v_floor_f64_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e64 v[2:3], |v[0:1]| +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -170,8 +170,8 @@ ; GFX6-LABEL: v_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: v_fract_f64_e64 v[2:3], -|v[0:1]| +; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], s[4:5] ; GFX6-NEXT: v_cmp_o_f64_e32 vcc, v[0:1], v[0:1] @@ -194,8 +194,8 @@ define amdgpu_ps <2 x float> @s_floor_f64(double inreg %x) { ; GFX6-LABEL: s_floor_f64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: v_fract_f64_e32 v[0:1], s[2:3] +; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -218,8 +218,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: v_fract_f64_e64 v[0:1], -s[2:3] +; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -243,8 +243,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: v_fract_f64_e64 v[0:1], |s[2:3]| +; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] @@ -268,8 +268,8 @@ define amdgpu_ps <2 x float> @s_floor_f64_fneg_fabs(double inreg %x) { ; GFX6-LABEL: s_floor_f64_fneg_fabs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: v_fract_f64_e64 v[0:1], -|s[2:3]| +; GFX6-NEXT: s_mov_b32 s0, -1 ; GFX6-NEXT: s_mov_b32 s1, 0x3fefffff ; GFX6-NEXT: v_min_f64 v[0:1], v[0:1], s[0:1] ; GFX6-NEXT: v_cmp_o_f64_e64 vcc, s[2:3], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -259,8 +259,8 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_mov_b32 s4, 0x80008000 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -328,15 +328,15 @@ ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 ; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 -; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 ; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -349,15 +349,15 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -36,12 +36,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -146,12 +146,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -270,12 +270,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -285,8 +285,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v3, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mov_b32 s2, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -390,12 +390,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -514,12 +514,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -642,12 +642,12 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, s6 ; VI-NEXT: v_mov_b32_e32 v5, s7 ; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -663,8 +663,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 ; VI-NEXT: v_max_f32_e32 v2, v5, v2 @@ -686,9 +686,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: global_store_dword v[0:1], v4, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -265,8 +265,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0x80008000 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -436,9 +436,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0x80008000 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 ; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 @@ -636,10 +636,10 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0x80008000 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 ; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 ; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 ; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -188,10 +188,10 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 +; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -274,10 +274,10 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 +; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -310,10 +310,10 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 @@ -340,8 +340,8 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 @@ -437,8 +437,8 @@ ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -543,8 +543,8 @@ ; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 ; GFX6-NEXT: s_lshr_b32 s5, s5, 1 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_lshr_b32 s2, s5, s2 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 7 ; GFX6-NEXT: s_andn2_b32 s4, 7, s4 @@ -622,11 +622,11 @@ ; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s7, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, s6 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_and_b32 s7, s2, 7 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s7 ; GFX10-NEXT: s_and_b32 s7, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -694,11 +694,11 @@ ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -718,13 +718,13 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v3, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 @@ -740,11 +740,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 @@ -798,11 +798,11 @@ ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 -; GFX6-NEXT: s_and_b32 s2, s2, s10 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 +; GFX6-NEXT: s_and_b32 s2, s2, s10 ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 ; GFX6-NEXT: s_and_b32 s0, s0, s10 @@ -810,8 +810,8 @@ ; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s3, s10 -; GFX6-NEXT: s_and_b32 s1, s1, s10 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s1, s1, s10 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -831,11 +831,11 @@ ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 @@ -852,19 +852,19 @@ ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 -; GFX8-NEXT: s_and_b32 s1, s1, s13 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_lshl_b32 s3, s5, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s13 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_lshl_b32 s3, s5, s3 ; GFX8-NEXT: s_lshr_b32 s5, s8, 1 ; GFX8-NEXT: s_and_b32 s0, s0, s13 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, s13 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, s13 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 @@ -885,11 +885,11 @@ ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 @@ -906,19 +906,19 @@ ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s13 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_lshl_b32 s3, s5, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s13 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_lshl_b32 s3, s5, s3 ; GFX9-NEXT: s_lshr_b32 s5, s8, 1 ; GFX9-NEXT: s_and_b32 s0, s0, s13 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s5, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, s13 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, s13 ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 @@ -946,10 +946,10 @@ ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, s13 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s2, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 @@ -963,13 +963,13 @@ ; GFX10-NEXT: s_and_b32 s4, s12, 7 ; GFX10-NEXT: s_andn2_b32 s6, 7, s12 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s11 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s7, s6 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, s11 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s2, s2, s11 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 @@ -990,12 +990,12 @@ ; GFX6-LABEL: v_fshl_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 1, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -1021,11 +1021,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v9, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v9 @@ -1033,8 +1033,8 @@ ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, v3, v9 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1043,12 +1043,12 @@ ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, 1 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 @@ -1060,17 +1060,17 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3 ; GFX8-NEXT: v_mov_b32_e32 v9, 0xff -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 ; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 @@ -1081,12 +1081,12 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1095,12 +1095,12 @@ ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: s_mov_b32 s5, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: s_mov_b32 s5, 1 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v10, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 @@ -1112,17 +1112,17 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 ; GFX9-NEXT: v_mov_b32_e32 v9, 0xff -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 ; GFX9-NEXT: v_xor_b32_e32 v6, -1, v7 @@ -1135,9 +1135,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1149,52 +1149,52 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 -; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 +; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 ; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 +; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 ; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12 -; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> @@ -1481,36 +1481,36 @@ ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_and_b32 s1, s1, s9 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_and_b32 s1, s1, s9 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_and_b32 s6, s8, s9 ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 ; GFX6-NEXT: s_and_b32 s10, s2, s9 ; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, s9 ; GFX6-NEXT: s_or_b32 s2, s10, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 -; GFX6-NEXT: s_and_b32 s3, s3, s9 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_and_b32 s3, s3, s9 ; GFX6-NEXT: s_or_b32 s2, s2, s6 +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 ; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1542,22 +1542,22 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: s_mov_b32 s6, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshr_b32 s0, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1575,13 +1575,13 @@ ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1612,17 +1612,17 @@ ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s10 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 +; GFX8-NEXT: s_and_b32 s1, s1, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_lshl_b32 s1, s1, s11 ; GFX8-NEXT: s_and_b32 s6, s9, s10 ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1637,17 +1637,17 @@ ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 -; GFX8-NEXT: s_and_b32 s3, s3, s10 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 +; GFX8-NEXT: s_and_b32 s3, s3, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_lshl_b32 s3, s3, s11 ; GFX8-NEXT: s_and_b32 s6, s9, s10 ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 ; GFX8-NEXT: s_and_b32 s6, s6, s10 @@ -1676,22 +1676,22 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: s_mov_b32 s6, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 @@ -1707,8 +1707,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1719,8 +1719,8 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -1745,21 +1745,21 @@ ; GFX9-NEXT: s_and_b32 s7, s9, s12 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s12 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, s13 ; GFX9-NEXT: s_and_b32 s7, s11, s12 ; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_and_b32 s7, s7, s12 ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 @@ -1770,21 +1770,21 @@ ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 -; GFX9-NEXT: s_and_b32 s3, s3, s12 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s11, s12 ; GFX9-NEXT: s_lshl_b32 s3, s3, s13 +; GFX9-NEXT: s_and_b32 s7, s11, s12 ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_and_b32 s7, s7, s12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 @@ -1794,9 +1794,9 @@ ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_and_b32 s7, s9, s12 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 @@ -1816,17 +1816,17 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_mov_b32 s7, 0xffffff ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: v_and_b32_e32 v3, s7, v3 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 @@ -1840,11 +1840,11 @@ ; GFX9-NEXT: s_lshr_b32 s0, s3, 1 ; GFX9-NEXT: v_and_b32_e32 v2, v3, v2 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 ; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1875,9 +1875,9 @@ ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: s_and_b32 s0, s0, s9 ; GFX10-NEXT: s_lshl_b32 s6, s6, s11 +; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: s_or_b32 s0, s0, s6 @@ -1895,49 +1895,50 @@ ; GFX10-NEXT: s_and_b32 s8, s10, s9 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_and_b32 s5, s5, s9 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, s11 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: s_and_b32 s5, s5, s9 ; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: s_lshl_b32 s5, s5, s11 ; GFX10-NEXT: s_and_b32 s8, s13, s9 ; GFX10-NEXT: s_or_b32 s5, s12, s5 ; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_or_b32 s5, s5, s8 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: s_and_b32 s12, s2, s9 ; GFX10-NEXT: s_lshl_b32 s8, s8, s11 ; GFX10-NEXT: s_and_b32 s10, s10, s9 ; GFX10-NEXT: s_or_b32 s8, s12, s8 ; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 ; GFX10-NEXT: s_bfe_u32 s8, s10, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_lshl_b32 s5, s8, 16 ; GFX10-NEXT: s_lshr_b32 s8, s3, 8 ; GFX10-NEXT: s_and_b32 s3, s3, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_lshl_b32 s3, s3, s11 ; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_and_b32 s3, s8, s9 @@ -1948,7 +1949,7 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff @@ -1961,22 +1962,21 @@ ; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s3 ; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 ; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 -; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 ; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -2023,8 +2023,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 ; GFX6-NEXT: v_and_b32_e32 v4, v5, v9 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v6 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 @@ -2080,8 +2080,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 ; GFX8-NEXT: v_and_b32_e32 v4, v5, v9 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v6 ; GFX8-NEXT: v_mul_hi_u32 v5, v4, v5 @@ -2594,9 +2594,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 -; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 @@ -2663,12 +2663,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 -; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 -; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 @@ -2748,15 +2748,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v4, v0, v4, 1 -; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 -; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 -; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_alignbit_b32 v5, v1, v5, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX10-NEXT: v_alignbit_b32 v6, v2, v6, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX10-NEXT: v_alignbit_b32 v7, v3, v7, 1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 @@ -3090,8 +3090,8 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3118,8 +3118,8 @@ ; GFX8-LABEL: v_fshl_i16_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3129,8 +3129,8 @@ ; GFX9-LABEL: v_fshl_i16_svs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s1, v0 @@ -3224,9 +3224,9 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 @@ -3283,8 +3283,8 @@ ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, s5 @@ -3334,15 +3334,15 @@ ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 @@ -3352,8 +3352,8 @@ ; GFX8-LABEL: v_fshl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 @@ -3413,8 +3413,8 @@ ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 12, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -3442,8 +3442,8 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, s4, v3 @@ -3460,8 +3460,8 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2 ; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3 @@ -3537,8 +3537,8 @@ ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_mov_b32 s0, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s0 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: s_and_b32 s2, s2, s0 ; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 @@ -3562,11 +3562,11 @@ ; GFX8-LABEL: v_fshl_v2i16_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 @@ -3635,9 +3635,9 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_lshl_b32 s0, s1, s0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 @@ -3654,17 +3654,17 @@ ; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: s_lshl_b32 s0, s2, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3680,9 +3680,9 @@ ; GFX9-NEXT: s_andn2_b32 s1, s2, s1 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s4 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s1, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3722,9 +3722,9 @@ ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_and_b32 s0, s1, s4 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 @@ -3747,9 +3747,9 @@ ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_lshr_b32 s0, s2, s4 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3774,8 +3774,8 @@ ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: s_lshr_b32 s1, s2, s4 @@ -3838,9 +3838,9 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s8, 15, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s5, s12 -; GFX6-NEXT: s_andn2_b32 s8, 15, s9 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 @@ -3861,8 +3861,8 @@ ; GFX6-NEXT: s_and_b32 s4, s7, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3898,14 +3898,14 @@ ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s5, 15 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s3, s12 +; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 15 @@ -3913,9 +3913,9 @@ ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s9, s12 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s3, s7, s3 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -3940,11 +3940,11 @@ ; GFX9-NEXT: s_and_b32 s2, s2, s8 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 ; GFX9-NEXT: s_andn2_b32 s4, s6, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s8 +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_and_b32 s4, s4, s8 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s7, s9 @@ -3963,8 +3963,8 @@ ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s8 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_and_b32 s4, s4, s8 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s3, s3, s5 @@ -3981,15 +3981,15 @@ ; GFX10-NEXT: s_and_b32 s7, s4, s6 ; GFX10-NEXT: s_lshr_b32 s11, s11, 1 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_andn2_b32 s4, s6, s4 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_lshr_b32 s10, s7, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s11, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s7 ; GFX10-NEXT: s_lshl_b32 s7, s8, s10 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_and_b32 s10, s4, s9 ; GFX10-NEXT: s_and_b32 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s10, s4, s9 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s10 ; GFX10-NEXT: s_lshr_b32 s4, s8, s4 @@ -4007,8 +4007,8 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s4, s6, s7 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s7, s5, s9 ; GFX10-NEXT: s_and_b32 s3, s3, s9 +; GFX10-NEXT: s_and_b32 s7, s5, s9 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s5, s6, s5 @@ -4040,17 +4040,17 @@ ; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, v6, v12 @@ -4073,8 +4073,8 @@ ; GFX8-LABEL: v_fshl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 @@ -4088,14 +4088,14 @@ ; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 ; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 @@ -4120,13 +4120,13 @@ ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 @@ -4174,8 +4174,8 @@ define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) { ; GCN-LABEL: s_fshl_i64_5: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s2, s3, 27 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GCN-NEXT: s_lshr_b32 s2, s3, 27 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog @@ -4483,8 +4483,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 ; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 @@ -4494,8 +4494,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 ; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 @@ -4505,8 +4505,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 ; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 @@ -4533,12 +4533,12 @@ ; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 ; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -4549,12 +4549,12 @@ ; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 ; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -4565,12 +4565,12 @@ ; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 ; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -4603,8 +4603,8 @@ ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 @@ -4624,8 +4624,8 @@ ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 @@ -4645,8 +4645,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 @@ -4718,8 +4718,8 @@ ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 @@ -4765,8 +4765,8 @@ ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 @@ -4812,8 +4812,8 @@ ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 @@ -4837,8 +4837,8 @@ ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 ; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 ; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 ; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 @@ -4891,8 +4891,8 @@ ; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc @@ -4911,11 +4911,11 @@ ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc @@ -4940,8 +4940,8 @@ ; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc @@ -4960,11 +4960,11 @@ ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc @@ -4989,8 +4989,8 @@ ; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc @@ -5009,11 +5009,11 @@ ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc @@ -5046,28 +5046,28 @@ ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 ; GFX10-NEXT: v_or_b32_e32 v0, v12, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5093,29 +5093,29 @@ ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: s_mov_b32 s8, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 ; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 @@ -5146,29 +5146,29 @@ ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: s_mov_b32 s8, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] ; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -5199,19 +5199,19 @@ ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] @@ -5220,8 +5220,8 @@ ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -5315,21 +5315,21 @@ ; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s3, 64, s4 ; GFX6-NEXT: s_sub_i32 s2, s4, 64 +; GFX6-NEXT: s_sub_i32 s3, 64, s4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s3 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s2 ; GFX6-NEXT: s_and_b32 s2, 1, s5 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX6-NEXT: s_and_b32 s2, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5370,21 +5370,21 @@ ; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s3, 64, s4 ; GFX8-NEXT: s_sub_i32 s2, s4, 64 +; GFX8-NEXT: s_sub_i32 s3, 64, s4 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3] ; GFX8-NEXT: s_and_b32 s2, 1, s5 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_and_b32 s2, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5425,21 +5425,21 @@ ; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s3, 64, s4 ; GFX9-NEXT: s_sub_i32 s2, s4, 64 +; GFX9-NEXT: s_sub_i32 s3, 64, s4 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s3, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], s2, v[2:3] ; GFX9-NEXT: s_and_b32 s2, 1, s5 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX9-NEXT: s_and_b32 s2, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5470,8 +5470,8 @@ ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 @@ -5519,8 +5519,8 @@ ; GFX6-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: s_sub_i32 s6, 64, s8 ; GFX6-NEXT: s_sub_i32 s5, s8, 64 +; GFX6-NEXT: s_sub_i32 s6, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 @@ -5530,8 +5530,8 @@ ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 ; GFX6-NEXT: s_and_b32 s5, 1, s9 -; GFX6-NEXT: s_lshl_b32 s9, s2, 31 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_lshl_b32 s9, s2, 31 ; GFX6-NEXT: s_mov_b32 s8, s7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 @@ -5544,15 +5544,15 @@ ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: s_cselect_b32 s11, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 @@ -5574,8 +5574,8 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX8-NEXT: s_sub_i32 s6, 64, s8 ; GFX8-NEXT: s_sub_i32 s5, s8, 64 +; GFX8-NEXT: s_sub_i32 s6, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 @@ -5585,8 +5585,8 @@ ; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX8-NEXT: s_and_b32 s5, 1, s9 -; GFX8-NEXT: s_lshl_b32 s9, s2, 31 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_lshl_b32 s9, s2, 31 ; GFX8-NEXT: s_mov_b32 s8, s7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 @@ -5599,15 +5599,15 @@ ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 @@ -5629,8 +5629,8 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_sub_i32 s6, 64, s8 ; GFX9-NEXT: s_sub_i32 s5, s8, 64 +; GFX9-NEXT: s_sub_i32 s6, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 @@ -5640,8 +5640,8 @@ ; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX9-NEXT: s_and_b32 s5, 1, s9 -; GFX9-NEXT: s_lshl_b32 s9, s2, 31 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_lshl_b32 s9, s2, 31 ; GFX9-NEXT: s_mov_b32 s8, s7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 @@ -5654,15 +5654,15 @@ ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 @@ -5686,20 +5686,20 @@ ; GFX10-NEXT: s_andn2_b64 s[10:11], s[6:7], s[4:5] ; GFX10-NEXT: s_sub_i32 s4, 64, s8 ; GFX10-NEXT: s_sub_i32 s5, s8, 64 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 -; GFX10-NEXT: s_lshl_b32 s5, s2, 31 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: s_lshl_b32 s5, s2, 31 ; GFX10-NEXT: s_and_b32 s6, 1, s6 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s4 @@ -5869,8 +5869,8 @@ ; GFX6-NEXT: s_cmp_eq_u32 s16, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -5895,8 +5895,8 @@ ; GFX6-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 @@ -5911,8 +5911,8 @@ ; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 @@ -5958,8 +5958,8 @@ ; GFX8-NEXT: s_cmp_eq_u32 s16, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -5984,8 +5984,8 @@ ; GFX8-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 @@ -6000,8 +6000,8 @@ ; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 @@ -6047,8 +6047,8 @@ ; GFX9-NEXT: s_cmp_eq_u32 s16, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 ; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -6073,8 +6073,8 @@ ; GFX9-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 @@ -6089,8 +6089,8 @@ ; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 ; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 @@ -6204,11 +6204,11 @@ ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[8:9], 1 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 @@ -6233,8 +6233,8 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] @@ -6276,8 +6276,8 @@ ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc @@ -6296,11 +6296,11 @@ ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 @@ -6325,8 +6325,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] @@ -6368,8 +6368,8 @@ ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc @@ -6388,11 +6388,11 @@ ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 31, v10 +; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 @@ -6417,8 +6417,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] @@ -6439,9 +6439,9 @@ ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc @@ -6460,8 +6460,8 @@ ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc @@ -6494,13 +6494,13 @@ ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo @@ -6508,14 +6508,14 @@ ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 -; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 +; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 @@ -6523,13 +6523,13 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v23 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 +; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v25 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 @@ -6547,22 +6547,22 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v23 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v25 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v23 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 ; GFX10-NEXT: v_or_b32_e32 v3, v22, v24 ; GFX10-NEXT: v_or_b32_e32 v4, v13, v5 -; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX10-NEXT: v_or_b32_e32 v5, v14, v8 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -265,9 +265,9 @@ ; GFX6-LABEL: s_fshr_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s3, s2, 7 -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -277,9 +277,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -289,9 +289,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -301,11 +301,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) @@ -318,9 +318,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -356,8 +356,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 @@ -534,16 +534,16 @@ ; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_and_b32 s5, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_movk_i32 s6, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s1, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s5 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 7 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_andn2_b32 s4, 7, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s1, s3, s1 @@ -565,13 +565,13 @@ ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_and_b32 s4, s4, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s4, s4, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_andn2_b32 s5, 7, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshl_b32 s3, s3, s5 ; GFX8-NEXT: s_lshr_b32 s1, s4, s1 ; GFX8-NEXT: s_or_b32 s1, s3, s1 @@ -594,13 +594,13 @@ ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: s_and_b32 s4, s4, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s4, s4, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_andn2_b32 s5, 7, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s3, s3, s5 ; GFX9-NEXT: s_lshr_b32 s1, s4, s1 ; GFX9-NEXT: s_or_b32 s1, s3, s1 @@ -616,17 +616,17 @@ ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_movk_i32 s7, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_and_b32 s4, s4, s7 ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s4, s4, s7 ; GFX10-NEXT: s_and_b32 s1, s1, s7 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s2, s4, s2 @@ -656,17 +656,17 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_bfe_u32 v1, v1, 8, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 @@ -687,8 +687,8 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 @@ -712,8 +712,8 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 @@ -738,11 +738,11 @@ ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 @@ -772,8 +772,8 @@ ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s10, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_movk_i32 s11, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s1, s11 ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 @@ -784,24 +784,24 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, s7 ; GFX6-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX6-NEXT: s_lshr_b32 s2, s7, s2 -; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s6, s1, 24 +; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_and_b32 s3, s8, 7 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX6-NEXT: s_andn2_b32 s7, 7, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s1, s1, s3 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX6-NEXT: s_lshl_b32 s4, s4, s7 +; GFX6-NEXT: s_lshr_b32 s1, s1, s3 ; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_and_b32 s3, s9, 7 -; GFX6-NEXT: s_and_b32 s2, s2, s11 ; GFX6-NEXT: s_andn2_b32 s4, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s1, s1, s11 +; GFX6-NEXT: s_and_b32 s2, s2, s11 ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s3, s6, s3 ; GFX6-NEXT: s_and_b32 s0, s0, s11 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NEXT: s_and_b32 s1, s1, s11 ; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -820,15 +820,15 @@ ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, s13 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s1, s1, s13 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 @@ -853,12 +853,12 @@ ; GFX8-NEXT: s_lshl_b32 s5, s5, 1 ; GFX8-NEXT: s_and_b32 s0, s0, s13 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s8, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s13 ; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, s13 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 @@ -874,15 +874,15 @@ ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, s13 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s1, s1, s13 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 @@ -907,12 +907,12 @@ ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 ; GFX9-NEXT: s_and_b32 s0, s0, s13 ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s3, s8, s3 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, s13 ; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, s13 ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 @@ -926,44 +926,44 @@ ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_and_b32 s6, s6, s13 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, s13 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s13 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s6, s6, s13 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_and_b32 s6, s7, s13 -; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_andn2_b32 s3, 7, s10 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_andn2_b32 s4, 7, s11 ; GFX10-NEXT: s_lshl_b32 s5, s5, 1 ; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, s13 ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s2, s2, s13 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 @@ -992,9 +992,9 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, v10, v11 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 @@ -1009,23 +1009,23 @@ ; GFX6-NEXT: v_and_b32_e32 v7, 7, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX6-NEXT: v_mov_b32_e32 v2, 0xff +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v7, v1 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v9 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v9 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v7, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -1038,10 +1038,10 @@ ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 @@ -1056,13 +1056,13 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_mov_b32_e32 v9, 0xff +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 ; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 @@ -1075,12 +1075,12 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1089,10 +1089,10 @@ ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 @@ -1107,13 +1107,13 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 ; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 @@ -1128,9 +1128,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1143,30 +1143,30 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 ; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 ; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 @@ -1178,15 +1178,15 @@ ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1467,11 +1467,11 @@ ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_and_b32 s1, s1, s9 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 +; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_and_b32 s10, s0, s9 ; GFX6-NEXT: s_bfe_u32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 @@ -1479,22 +1479,22 @@ ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s7, s8, s9 ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshr_b32 s10, s2, 24 ; GFX6-NEXT: s_and_b32 s13, s2, s9 ; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, s9 ; GFX6-NEXT: s_or_b32 s2, s13, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s12, s3, 8 -; GFX6-NEXT: s_and_b32 s3, s3, s9 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_and_b32 s3, s3, s9 ; GFX6-NEXT: s_or_b32 s2, s2, s8 +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 ; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s10, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1526,16 +1526,16 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s10, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: s_or_b32 s5, s5, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: s_and_b32 s6, s6, s9 @@ -1546,9 +1546,9 @@ ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: s_or_b32 s0, s4, s0 ; GFX6-NEXT: v_and_b32_e32 v2, s8, v3 +; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 @@ -1558,23 +1558,23 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 -; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1598,17 +1598,17 @@ ; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 ; GFX8-NEXT: s_and_b32 s1, s1, s10 ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s11 -; GFX8-NEXT: s_or_b32 s1, s8, s1 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s8, s8, s10 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s10 ; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, s10 ; GFX8-NEXT: s_and_b32 s7, s9, s10 @@ -1622,17 +1622,17 @@ ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX8-NEXT: s_lshr_b32 s13, s3, 8 -; GFX8-NEXT: s_and_b32 s3, s3, s10 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 +; GFX8-NEXT: s_and_b32 s3, s3, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s3, s3, s11 ; GFX8-NEXT: s_and_b32 s8, s13, s10 ; GFX8-NEXT: s_or_b32 s3, s12, s3 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 ; GFX8-NEXT: s_and_b32 s8, s8, s10 @@ -1661,16 +1661,16 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_and_b32 s8, s13, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s12, s5 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_or_b32 s5, s5, s8 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -1680,9 +1680,9 @@ ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 @@ -1692,16 +1692,16 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 @@ -1712,8 +1712,8 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -1735,14 +1735,14 @@ ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 -; GFX9-NEXT: s_and_b32 s10, s10, s12 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, s12 ; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, s12 ; GFX9-NEXT: s_and_b32 s9, s11, s12 @@ -1757,17 +1757,17 @@ ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s15, s3, 8 -; GFX9-NEXT: s_and_b32 s3, s3, s12 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 +; GFX9-NEXT: s_and_b32 s3, s3, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s10 +; GFX9-NEXT: s_lshl_b32 s3, s3, s13 ; GFX9-NEXT: s_and_b32 s10, s15, s12 ; GFX9-NEXT: s_or_b32 s3, s14, s3 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: s_or_b32 s3, s3, s10 ; GFX9-NEXT: s_lshr_b32 s10, s4, 8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -1791,9 +1791,9 @@ ; GFX9-NEXT: s_and_b32 s10, s15, s12 ; GFX9-NEXT: s_or_b32 s5, s14, s5 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: s_or_b32 s5, s5, s10 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 @@ -1805,17 +1805,17 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_mov_b32 s10, 0xffffff +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s10, v0 ; GFX9-NEXT: s_lshl_b32 s4, s7, 17 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s10, v0 ; GFX9-NEXT: s_or_b32 s0, s4, s0 ; GFX9-NEXT: v_and_b32_e32 v3, s10, v3 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 @@ -1823,21 +1823,21 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX9-NEXT: s_lshl_b32 s0, s9, 17 ; GFX9-NEXT: s_lshl_b32 s1, s1, 1 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: v_and_b32_e32 v3, v3, v2 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 ; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 +; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1855,10 +1855,10 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX10-NEXT: s_movk_i32 s9, 0xff ; GFX10-NEXT: s_lshr_b32 s12, s4, 8 -; GFX10-NEXT: s_lshr_b32 s13, s4, 16 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s13, s4, 16 ; GFX10-NEXT: s_and_b32 s12, s12, s9 ; GFX10-NEXT: s_lshr_b32 s14, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, s9 @@ -1869,13 +1869,13 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_lshr_b32 s15, s5, 8 -; GFX10-NEXT: s_and_b32 s5, s5, s9 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_lshl_b32 s12, s12, 16 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_and_b32 s5, s5, s9 ; GFX10-NEXT: s_or_b32 s4, s4, s12 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 ; GFX10-NEXT: s_and_b32 s12, s15, s9 @@ -1896,18 +1896,18 @@ ; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: s_and_b32 s0, s0, s9 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: s_lshl_b32 s6, s6, s10 -; GFX10-NEXT: s_lshr_b32 s13, s2, 24 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: s_and_b32 s6, s7, s9 ; GFX10-NEXT: s_and_b32 s7, s11, s9 +; GFX10-NEXT: s_lshr_b32 s11, s2, 16 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 +; GFX10-NEXT: s_lshr_b32 s13, s2, 24 ; GFX10-NEXT: s_and_b32 s2, s2, s9 ; GFX10-NEXT: s_lshl_b32 s8, s8, s10 ; GFX10-NEXT: s_lshr_b32 s12, s3, 8 @@ -2018,13 +2018,13 @@ ; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v7, v7, v9 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 @@ -2035,8 +2035,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2075,13 +2075,13 @@ ; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v7, v7, v9 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 +; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 @@ -2092,8 +2092,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2640,10 +2640,10 @@ ; GFX10-LABEL: s_fshr_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 -; GFX10-NEXT: s_andn2_b32 s2, 15, s2 ; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_andn2_b32 s2, 15, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 @@ -2783,8 +2783,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 @@ -2966,8 +2966,8 @@ ; GFX10-LABEL: v_fshr_i16_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 -; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3041,32 +3041,32 @@ ; GFX6-NEXT: s_and_b32 s4, s4, s6 ; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s7, s2, s6 ; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_and_b32 s7, s2, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_and_b32 s5, s3, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s7, s7, 15 ; GFX6-NEXT: s_lshr_b32 s5, s5, 15 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_and_b32 s2, s2, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s7 -; GFX6-NEXT: s_and_b32 s7, s4, 15 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 +; GFX6-NEXT: s_and_b32 s7, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_and_b32 s2, s2, s6 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s0, s0, s7 +; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_lshr_b32 s2, s2, 1 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 @@ -3086,25 +3086,25 @@ ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s5 ; GFX8-NEXT: s_lshr_b32 s6, s6, s7 -; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s6, s4, s7 ; GFX8-NEXT: s_lshl_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s6, s4, s7 +; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_xor_b32 s2, s2, -1 -; GFX8-NEXT: s_and_b32 s7, s2, 15 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_and_b32 s7, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s7 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s6, 15 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 15, s6 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 @@ -3181,35 +3181,35 @@ ; GFX6-NEXT: v_and_b32_e32 v4, v4, v6 ; GFX6-NEXT: s_mov_b32 s5, 0xffff ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v5, s5, v2 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX6-NEXT: v_and_b32_e32 v5, s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_and_b32_e32 v5, s5, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_and_b32_e32 v7, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_bfe_u32 v7, v7, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_and_b32_e32 v2, v3, v6 -; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 @@ -3221,17 +3221,17 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5 @@ -3323,8 +3323,8 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 -; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GFX9-NEXT: v_mul_f32_e32 v2, s4, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, s4, v3 @@ -3341,8 +3341,8 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2 ; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3 @@ -3417,32 +3417,32 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: s_and_b32 s6, s2, s5 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX6-NEXT: s_and_b32 s6, s2, s5 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 ; GFX6-NEXT: s_lshr_b32 s6, s6, 15 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_and_b32 s0, s2, s5 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s3, s5 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: s_lshr_b32 s4, s4, 15 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_and_b32 s0, s3, s5 -; GFX6-NEXT: s_lshr_b32 s4, s4, 15 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 @@ -3462,27 +3462,27 @@ ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s6, 15, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s5, s5, s6 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s5, s3, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s4 -; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 @@ -3534,32 +3534,32 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, s4 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_lshl_b32 s0, s1, s3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_xor_b32 s0, s2, -1 -; GFX6-NEXT: s_and_b32 s2, s0, 15 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: s_and_b32 s2, s0, 15 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 @@ -3575,20 +3575,20 @@ ; GFX8-LABEL: v_fshr_v2i16_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_lshl_b32 s0, s2, s3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX8-NEXT: s_xor_b32 s0, s1, -1 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: s_xor_b32 s0, s1, -1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_and_b32 s2, s0, 15 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_and_b32 s2, s0, 15 ; GFX8-NEXT: s_andn2_b32 s0, 15, s0 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, s0, v3 @@ -3653,32 +3653,32 @@ ; GFX6-NEXT: s_and_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s5, s0, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_and_b32 s5, s0, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_and_b32 s3, s1, s4 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s5, s5, 15 ; GFX6-NEXT: s_lshr_b32 s3, s3, 15 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX6-NEXT: s_and_b32 s5, s2, 15 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: s_and_b32 s5, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: s_and_b32 s0, s1, s4 -; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 @@ -3704,17 +3704,17 @@ ; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_xor_b32 s1, s1, -1 -; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX8-NEXT: s_and_b32 s5, s1, 15 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s5, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshr_b32 s0, s0, s3 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, s5, v1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s4, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_andn2_b32 s1, 15, s4 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 ; GFX8-NEXT: s_bfe_u32 s0, s2, 0x100000 @@ -3753,8 +3753,8 @@ ; GFX10-NEXT: s_and_b32 s4, s1, s2 ; GFX10-NEXT: s_andn2_b32 s1, s2, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 ; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 ; GFX10-NEXT: s_and_b32 s1, s4, s3 ; GFX10-NEXT: s_lshr_b32 s3, s4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 @@ -3790,34 +3790,34 @@ ; GFX6-NEXT: s_or_b32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 ; GFX6-NEXT: s_and_b32 s10, s10, s12 -; GFX6-NEXT: s_and_b32 s11, s4, s12 ; GFX6-NEXT: s_or_b32 s9, s9, s10 ; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 +; GFX6-NEXT: s_and_b32 s11, s4, s12 ; GFX6-NEXT: s_lshl_b32 s0, s0, s10 ; GFX6-NEXT: s_lshr_b32 s11, s11, 15 ; GFX6-NEXT: s_or_b32 s0, s0, s11 ; GFX6-NEXT: s_and_b32 s11, s5, s12 -; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_xor_b32 s8, s8, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, s10 ; GFX6-NEXT: s_lshr_b32 s11, s11, 15 -; GFX6-NEXT: s_and_b32 s13, s8, 15 -; GFX6-NEXT: s_and_b32 s4, s4, s12 +; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_xor_b32 s8, s8, -1 ; GFX6-NEXT: s_or_b32 s1, s1, s11 ; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_and_b32 s13, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 +; GFX6-NEXT: s_and_b32 s4, s4, s12 ; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s0, s0, s13 +; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s11, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s5, s12 -; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 @@ -3836,22 +3836,22 @@ ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 -; GFX6-NEXT: s_and_b32 s3, s3, s12 ; GFX6-NEXT: s_lshl_b32 s4, s7, 1 -; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 +; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 +; GFX6-NEXT: s_and_b32 s3, s3, s12 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, 1 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 15 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_andn2_b32 s5, 15, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s4, s12 -; GFX6-NEXT: s_andn2_b32 s5, 15, s6 ; GFX6-NEXT: s_lshr_b32 s3, s3, 1 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 @@ -3871,25 +3871,25 @@ ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: s_lshr_b32 s9, s9, s10 -; GFX8-NEXT: s_lshl_b32 s2, s2, s8 ; GFX8-NEXT: s_or_b32 s0, s0, s9 -; GFX8-NEXT: s_lshr_b32 s9, s7, s10 ; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s9, s7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_and_b32 s11, s4, 15 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_or_b32 s6, s6, s9 ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 +; GFX8-NEXT: s_and_b32 s11, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s2, s8 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s9, 15 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s7, s7, s8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_andn2_b32 s4, 15, s9 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 ; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 @@ -3898,36 +3898,36 @@ ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x100000 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 ; GFX8-NEXT: s_lshr_b32 s6, s6, s10 -; GFX8-NEXT: s_lshl_b32 s3, s3, s8 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s6, s4, s10 ; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshr_b32 s6, s4, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 ; GFX8-NEXT: s_xor_b32 s5, s5, -1 -; GFX8-NEXT: s_and_b32 s7, s5, 15 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_and_b32 s7, s5, 15 ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s3, s8 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, s7 +; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s6, 15 -; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s4, s8 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_andn2_b32 s5, 15, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 -; GFX8-NEXT: s_andn2_b32 s5, 15, s6 ; GFX8-NEXT: s_lshr_b32 s3, s3, s8 ; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 @@ -3940,14 +3940,14 @@ ; ; GFX9-LABEL: s_fshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_mov_b32 s8, 0x10001 +; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_mov_b32 s6, 0xf000f ; GFX9-NEXT: s_lshl_b32 s0, s0, s8 ; GFX9-NEXT: s_lshl_b32 s9, s9, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 ; GFX9-NEXT: s_and_b32 s7, s4, s6 ; GFX9-NEXT: s_andn2_b32 s4, s6, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 @@ -3955,8 +3955,8 @@ ; GFX9-NEXT: s_mov_b32 s9, 0xffff ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_lshr_b32 s10, s7, 16 ; GFX9-NEXT: s_and_b32 s7, s7, s9 ; GFX9-NEXT: s_lshr_b32 s2, s2, s7 ; GFX9-NEXT: s_lshr_b32 s4, s4, s10 @@ -3974,8 +3974,8 @@ ; GFX9-NEXT: s_lshl_b32 s4, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s9 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s4, s5 @@ -3985,8 +3985,8 @@ ; ; GFX10-LABEL: s_fshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_mov_b32 s7, 0x10001 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_mov_b32 s6, 0xf000f ; GFX10-NEXT: s_lshl_b32 s0, s0, s7 ; GFX10-NEXT: s_lshl_b32 s8, s8, 1 @@ -4008,14 +4008,14 @@ ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-NEXT: s_lshr_b32 s6, s4, 16 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_and_b32 s11, s9, s8 ; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s11, s9, s8 ; GFX10-NEXT: s_lshr_b32 s9, s9, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s4, s5, s6 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s6, s7, s8 ; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_and_b32 s6, s7, s8 ; GFX10-NEXT: s_lshr_b32 s7, s7, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s11 ; GFX10-NEXT: s_lshr_b32 s9, s10, s9 @@ -4043,36 +4043,36 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX6-NEXT: v_and_b32_e32 v10, v10, v12 ; GFX6-NEXT: s_mov_b32 s5, 0xffff -; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_and_b32_e32 v10, s5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 15, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX6-NEXT: v_and_b32_e32 v10, s5, v5 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 15, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 @@ -4087,24 +4087,24 @@ ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 -; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 -; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 @@ -4119,14 +4119,14 @@ ; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, 1 ; GFX8-NEXT: v_mov_b32_e32 v8, 15 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 @@ -4153,8 +4153,8 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 @@ -4181,8 +4181,8 @@ ; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 ; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 @@ -4289,8 +4289,8 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) { ; GCN-LABEL: s_fshr_i64_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s2, s3, 16 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-NEXT: s_lshr_b32 s2, s3, 16 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog @@ -4998,8 +4998,8 @@ ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], v15 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v14 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 @@ -5047,8 +5047,8 @@ ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 @@ -5096,8 +5096,8 @@ ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 @@ -5143,18 +5143,18 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s6 ; GFX10-NEXT: v_or_b32_e32 v0, v14, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5170,13 +5170,13 @@ ; GFX6-NEXT: s_movk_i32 s8, 0x7f ; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX6-NEXT: s_mov_b32 s9, 0 +; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s8, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 @@ -5185,13 +5185,13 @@ ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], s[10:11], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6 @@ -5201,13 +5201,13 @@ ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5223,13 +5223,13 @@ ; GFX8-NEXT: s_movk_i32 s8, 0x7f ; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX8-NEXT: s_mov_b32 s9, 0 +; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s8, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 @@ -5238,13 +5238,13 @@ ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6 @@ -5254,13 +5254,13 @@ ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5276,13 +5276,13 @@ ; GFX9-NEXT: s_movk_i32 s8, 0x7f ; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX9-NEXT: s_mov_b32 s9, 0 +; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s8, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 @@ -5291,10 +5291,10 @@ ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -5307,13 +5307,13 @@ ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc @@ -5328,17 +5328,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: s_movk_i32 s10, 0x7f -; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: v_and_b32_e32 v13, s10, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_and_b32_e32 v12, s10, v1 +; GFX10-NEXT: v_and_b32_e32 v13, s10, v0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: v_and_b32_e32 v12, s10, v1 +; GFX10-NEXT: s_lshr_b32 s8, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] @@ -5357,15 +5357,15 @@ ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, s8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -5404,20 +5404,20 @@ ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_sub_i32 s5, 64, s8 ; GFX6-NEXT: s_sub_i32 s4, s8, 64 +; GFX6-NEXT: s_sub_i32 s5, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s6, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s8 ; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s8 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s4 ; GFX6-NEXT: s_and_b32 s4, 1, s6 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: s_and_b32 s4, 1, s7 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5460,20 +5460,20 @@ ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_sub_i32 s5, 64, s8 ; GFX8-NEXT: s_sub_i32 s4, s8, 64 +; GFX8-NEXT: s_sub_i32 s5, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] ; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX8-NEXT: s_and_b32 s4, 1, s6 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_and_b32 s4, 1, s7 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5516,20 +5516,20 @@ ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_sub_i32 s5, 64, s8 ; GFX9-NEXT: s_sub_i32 s4, s8, 64 +; GFX9-NEXT: s_sub_i32 s5, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX9-NEXT: s_and_b32 s4, 1, s6 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_and_b32 s4, 1, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -5628,18 +5628,18 @@ ; GFX6-NEXT: s_and_b32 s4, 1, s9 ; GFX6-NEXT: s_sub_i32 s10, s8, 64 ; GFX6-NEXT: s_sub_i32 s9, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s11, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 @@ -5682,18 +5682,18 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s9 ; GFX8-NEXT: s_sub_i32 s10, s8, 64 ; GFX8-NEXT: s_sub_i32 s9, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 @@ -5736,18 +5736,18 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s9 ; GFX9-NEXT: s_sub_i32 s10, s8, 64 ; GFX9-NEXT: s_sub_i32 s9, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 @@ -5778,8 +5778,8 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX10-NEXT: s_sub_i32 s4, 64, s8 ; GFX10-NEXT: s_sub_i32 s5, s8, 64 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] ; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 @@ -5865,13 +5865,13 @@ ; GFX10-LABEL: s_fshr_i128_65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_lshl_b32 s3, s2, 31 ; GFX10-NEXT: s_lshl_b32 s5, s0, 31 +; GFX10-NEXT: s_lshl_b32 s3, s2, 31 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result @@ -5881,9 +5881,9 @@ ; GFX6-LABEL: v_fshr_i128_65: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 1 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 @@ -5892,9 +5892,9 @@ ; GFX8-LABEL: v_fshr_i128_65: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 @@ -5903,9 +5903,9 @@ ; GFX9-LABEL: v_fshr_i128_65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 31, v2 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 31, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v4, v1 @@ -5936,8 +5936,8 @@ ; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 -; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, s19 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: s_sub_i32 s23, s16, 64 @@ -5963,8 +5963,8 @@ ; GFX6-NEXT: s_cmp_eq_u32 s22, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX6-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] ; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -5975,8 +5975,8 @@ ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX6-NEXT: s_lshr_b32 s18, s5, 31 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 @@ -6004,8 +6004,8 @@ ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 @@ -6025,8 +6025,8 @@ ; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 -; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, s19 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: s_sub_i32 s23, s16, 64 @@ -6052,8 +6052,8 @@ ; GFX8-NEXT: s_cmp_eq_u32 s22, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX8-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] ; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -6064,8 +6064,8 @@ ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX8-NEXT: s_lshr_b32 s18, s5, 31 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 @@ -6093,8 +6093,8 @@ ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 @@ -6114,8 +6114,8 @@ ; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] ; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 -; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, s19 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: s_sub_i32 s23, s16, 64 @@ -6141,8 +6141,8 @@ ; GFX9-NEXT: s_cmp_eq_u32 s22, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX9-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 ; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] ; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -6153,8 +6153,8 @@ ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX9-NEXT: s_lshr_b32 s18, s5, 31 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 @@ -6182,8 +6182,8 @@ ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 @@ -6198,12 +6198,12 @@ ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s19, 0 ; GFX10-NEXT: s_movk_i32 s18, 0x7f -; GFX10-NEXT: s_lshr_b32 s24, s1, 31 +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] ; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_lshr_b32 s24, s1, 31 ; GFX10-NEXT: s_mov_b32 s25, s19 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] @@ -6243,8 +6243,8 @@ ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX10-NEXT: s_lshr_b32 s18, s5, 31 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 +; GFX10-NEXT: s_lshr_b32 s18, s5, 31 ; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] @@ -6300,9 +6300,9 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 +; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6366,14 +6366,14 @@ ; GFX6-NEXT: v_or_b32_e32 v11, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v17, v5, v7 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v16 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v4, v18, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v19, v7 @@ -6392,9 +6392,9 @@ ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6458,14 +6458,14 @@ ; GFX8-NEXT: v_or_b32_e32 v11, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v17, v5, v7 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v18, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v19, v7 @@ -6484,9 +6484,9 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6521,8 +6521,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 -; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6550,14 +6550,14 @@ ; GFX9-NEXT: v_or_b32_e32 v11, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v17, v5, v7 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v4, v18, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v19, v7 @@ -6608,18 +6608,18 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX10-NEXT: v_and_b32_e32 v25, s5, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 -; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 +; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_and_b32_e32 v23, s5, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v3, s4 -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v18, v2 @@ -6639,20 +6639,20 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v10, v3, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v16, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v18, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v1, v24, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v6, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v5, v7, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v13, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v3, s4 -; GFX10-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v4, s4 ; GFX10-NEXT: v_or_b32_e32 v3, v22, v26 +; GFX10-NEXT: v_or_b32_e32 v4, v11, v5 ; GFX10-NEXT: v_or_b32_e32 v5, v14, v8 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -143,9 +143,9 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s48 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496 ; GCN-NEXT: v_mov_b32_e32 v0, s49 -; GCN-NEXT: s_and_b32 s4, s25, 63 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500 ; GCN-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NEXT: s_and_b32 s4, s25, 63 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504 ; GCN-NEXT: v_mov_b32_e32 v0, s51 ; GCN-NEXT: s_lshl_b32 s4, s4, 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -92,9 +92,9 @@ ; GFX9-NEXT: s_and_b32 s2, s2, s1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s0 ; GFX9-NEXT: s_lshl_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3 @@ -164,8 +164,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -288,8 +288,8 @@ ; ; GFX10-LABEL: insertelement_s_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 @@ -362,8 +362,8 @@ ; ; GFX10-LABEL: insertelement_s_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -402,8 +402,8 @@ ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -420,8 +420,8 @@ ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: s_and_b32 s1, s2, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 @@ -478,8 +478,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_not_b32 s0, s0 @@ -552,8 +552,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 @@ -568,8 +568,8 @@ ; GFX7-LABEL: insertelement_v_v2i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 @@ -665,13 +665,13 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s1, s3, 1 ; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b32 s2, s2, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 @@ -780,9 +780,9 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -806,9 +806,9 @@ ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -832,9 +832,9 @@ ; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -994,12 +994,12 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 @@ -1020,12 +1020,12 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 @@ -1048,12 +1048,12 @@ ; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -1102,9 +1102,9 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 @@ -1125,9 +1125,9 @@ ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 @@ -1149,9 +1149,9 @@ ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_and_b32 s1, s2, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 @@ -1204,8 +1204,8 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_lshr_b32 s1, s2, 1 ; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, 4 ; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 4 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1226,10 +1226,10 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_lshr_b32 s1, s2, 1 ; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 4 ; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s2, s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_not_b32 s0, s0 @@ -1302,8 +1302,8 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -1324,8 +1324,8 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 @@ -1345,9 +1345,9 @@ ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 @@ -1693,15 +1693,15 @@ ; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1729,15 +1729,15 @@ ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1765,11 +1765,11 @@ ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -1826,9 +1826,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 @@ -1837,11 +1837,11 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_and_b32 s4, s4, s5 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 @@ -1863,9 +1863,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_mov_b32 s5, 0xffff -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 @@ -1874,11 +1874,11 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s4, s4, s5 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 @@ -1901,9 +1901,9 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 @@ -1912,11 +1912,11 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -1926,10 +1926,10 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -1980,21 +1980,21 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: s_mov_b32 s8, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 @@ -2016,21 +2016,21 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s8, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 @@ -2053,9 +2053,9 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 @@ -2064,11 +2064,11 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -2078,10 +2078,10 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2091,11 +2091,11 @@ ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 ; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 @@ -2130,9 +2130,9 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -2159,9 +2159,9 @@ ; GFX8-LABEL: insertelement_v_v8i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -2192,9 +2192,9 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_and_b32 s1, s2, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -2257,9 +2257,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -2285,9 +2285,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 @@ -2318,8 +2318,8 @@ ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_lshr_b32 s4, s2, 1 ; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshr_b32 s4, s2, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 @@ -2382,9 +2382,9 @@ ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -2410,9 +2410,9 @@ ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -2440,12 +2440,12 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -2469,11 +2469,11 @@ ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 @@ -2539,19 +2539,19 @@ ; GFX9-NEXT: s_cmp_eq_u32 s7, 4 ; GFX9-NEXT: s_cselect_b32 s4, s16, s12 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s5, s16, s13 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_cselect_b32 s7, s16, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_mov_b64 s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 @@ -2598,19 +2598,19 @@ ; GFX8-NEXT: s_cmp_eq_u32 s7, 4 ; GFX8-NEXT: s_cselect_b32 s4, s16, s12 ; GFX8-NEXT: s_cmp_eq_u32 s7, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s5, s16, s13 ; GFX8-NEXT: s_cmp_eq_u32 s7, 6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_cselect_b32 s7, s16, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 @@ -2656,9 +2656,9 @@ ; GFX7-NEXT: s_cmp_eq_u32 s7, 4 ; GFX7-NEXT: s_cselect_b32 s4, s16, s12 ; GFX7-NEXT: s_cmp_eq_u32 s7, 5 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s5, s16, s13 ; GFX7-NEXT: s_cmp_eq_u32 s7, 6 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s6, s16, s14 ; GFX7-NEXT: s_cmp_eq_u32 s7, 7 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 @@ -2667,10 +2667,10 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_cselect_b32 s7, s16, s15 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_mov_b64 s[8:9], 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 @@ -2747,8 +2747,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 ; GFX9-NEXT: s_and_b32 s2, s2, s0 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 @@ -2774,8 +2774,8 @@ ; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] @@ -2796,8 +2796,8 @@ ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_lshr_b32 s12, s3, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s3, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: s_and_b32 s2, s2, s0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 @@ -2846,8 +2846,8 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_lshr_b32 s12, s3, 1 ; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshr_b32 s12, s3, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: s_and_b32 s2, s2, s0 ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 @@ -2879,9 +2879,9 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm @@ -2968,25 +2968,25 @@ ; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc @@ -3027,31 +3027,31 @@ ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3086,32 +3086,32 @@ ; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX7-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX7-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -3197,21 +3197,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s20 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v6, s21 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, s22 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_and_b32 s4, s4, s5 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v9, s23 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 @@ -3224,9 +3224,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] @@ -3257,21 +3257,21 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s20 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v6, s21 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s4, s4, s5 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3294,8 +3294,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3318,21 +3318,21 @@ ; GFX7-NEXT: v_mov_b32_e32 v5, s20 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v6, s21 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3346,18 +3346,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v5, s21 ; GFX7-NEXT: v_mov_b32_e32 v6, s22 ; GFX7-NEXT: v_mov_b32_e32 v7, s23 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -3438,20 +3438,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s16 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, s17 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_mov_b32_e32 v9, s18 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: s_mov_b32 s20, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX9-NEXT: v_mov_b32_e32 v10, s19 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 @@ -3464,9 +3464,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] @@ -3497,20 +3497,20 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, s16 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, s17 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, s18 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s20, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX8-NEXT: v_mov_b32_e32 v10, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 @@ -3533,8 +3533,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3557,21 +3557,21 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s20, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v7, s17 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_mov_b32 s20, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s20, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 ; GFX7-NEXT: v_mov_b32_e32 v10, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -3585,18 +3585,18 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -3619,9 +3619,9 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 @@ -3663,9 +3663,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -3689,8 +3689,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] @@ -3711,9 +3711,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -3747,8 +3747,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3761,9 +3761,9 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: s_and_b32 s1, s2, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -3794,10 +3794,10 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm @@ -3860,9 +3860,9 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -3884,8 +3884,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] @@ -3907,9 +3907,9 @@ ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] ; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 @@ -3942,8 +3942,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3957,8 +3957,8 @@ ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 ; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_lshr_b32 s12, s2, 1 ; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshr_b32 s12, s2, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 @@ -3989,10 +3989,10 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm @@ -4055,9 +4055,9 @@ ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -4102,9 +4102,9 @@ ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -4135,8 +4135,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -4149,12 +4149,12 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 @@ -4181,11 +4181,11 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GFX7-NEXT: s_mov_b64 s[16:17], 16 ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -12,8 +12,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -32,8 +32,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -97,8 +97,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -115,8 +115,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -178,8 +178,8 @@ ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -197,8 +197,8 @@ ; GFX8-NEXT: flat_load_ushort v1, v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -262,8 +262,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -282,8 +282,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -430,8 +430,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -448,8 +448,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -510,8 +510,8 @@ ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -527,8 +527,8 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -588,8 +588,8 @@ ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -605,8 +605,8 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -721,9 +721,9 @@ ; GFX9-LABEL: insertelement_v_v4i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_and_b32 s3, s3, 3 ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_and_b32 s3, s3, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 @@ -850,11 +850,11 @@ ; GFX10-NEXT: s_not_b32 s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 @@ -878,8 +878,8 @@ ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 ; GFX9-NEXT: s_and_b32 s6, s0, s5 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s6, s0 @@ -891,8 +891,8 @@ ; GFX9-NEXT: s_andn2_b32 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 @@ -912,8 +912,8 @@ ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s1, 24 ; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s3, s1 @@ -921,8 +921,8 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s4, 3 ; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s0, s1, s0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -947,8 +947,8 @@ ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, s5 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s2, s0 @@ -963,8 +963,8 @@ ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1027,8 +1027,8 @@ ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 ; GFX9-NEXT: s_and_b32 s6, s0, s5 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s6, s0 @@ -1039,9 +1039,9 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v0, s0, v0, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: s_mov_b32 s2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 @@ -1062,8 +1062,8 @@ ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s1, 24 ; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s3, s1 @@ -1098,8 +1098,8 @@ ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, s5 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s2, s0 @@ -1114,8 +1114,8 @@ ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1179,20 +1179,20 @@ ; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 ; GFX9-NEXT: s_and_b32 s5, s0, s4 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 ; GFX9-NEXT: s_or_b32 s0, s5, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: s_mov_b32 s2, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 @@ -1213,13 +1213,13 @@ ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s1, 24 ; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: s_or_b32 s1, s3, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, s1, v1 @@ -1249,8 +1249,8 @@ ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: s_and_b32 s2, s0, s4 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s2, s0 @@ -1264,8 +1264,8 @@ ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 @@ -1320,9 +1320,9 @@ ; GFX9-LABEL: insertelement_v_v4i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 @@ -1354,9 +1354,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 @@ -1468,11 +1468,11 @@ ; GFX9-LABEL: insertelement_v_v4i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: s_not_b32 s2, s2 @@ -1594,11 +1594,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 @@ -1614,11 +1614,11 @@ ; GFX9-LABEL: insertelement_v_v4i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s2 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 @@ -1633,8 +1633,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 @@ -1649,11 +1649,11 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, 0xff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 @@ -1775,13 +1775,13 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s7, s0 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s7, s1, s8 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s1, s6 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_or_b32 s2, s2, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1804,18 +1804,18 @@ ; GFX9-NEXT: s_bfe_u32 s5, s0, s8 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 ; GFX9-NEXT: s_and_b32 s4, s0, s6 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, s9 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s4, s0 -; GFX9-NEXT: s_bfe_u32 s4, s1, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s4, s1, s8 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s2, s1, s6 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1843,13 +1843,13 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s7, s0 -; GFX8-NEXT: s_bfe_u32 s7, s1, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s7, s1, s8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s1, s6 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s9 ; GFX8-NEXT: s_or_b32 s2, s2, s7 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -1872,18 +1872,18 @@ ; GFX8-NEXT: s_bfe_u32 s5, s0, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_and_b32 s4, s0, s6 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, s9 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s4, s1, s8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s1, s6 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s9 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -1909,13 +1909,13 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s7, s0 -; GFX7-NEXT: s_bfe_u32 s7, s1, s8 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_bfe_u32 s7, s1, s8 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, s6 -; GFX7-NEXT: s_bfe_u32 s1, s1, s9 ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s9 ; GFX7-NEXT: s_or_b32 s2, s2, s7 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -1938,8 +1938,8 @@ ; GFX7-NEXT: s_bfe_u32 s10, s4, s8 ; GFX7-NEXT: s_lshr_b32 s2, s4, 24 ; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_bfe_u32 s4, s4, s9 ; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s4, s4, s9 ; GFX7-NEXT: s_or_b32 s7, s7, s10 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 ; GFX7-NEXT: s_or_b32 s4, s7, s4 @@ -1948,16 +1948,16 @@ ; GFX7-NEXT: s_and_b32 s4, s3, s6 ; GFX7-NEXT: s_bfe_u32 s6, s3, s8 ; GFX7-NEXT: s_lshr_b32 s5, s3, 24 -; GFX7-NEXT: s_bfe_u32 s3, s3, s9 ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s3, s3, s9 ; GFX7-NEXT: s_or_b32 s4, s4, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s4, s3 ; GFX7-NEXT: s_lshl_b32 s4, s5, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1983,13 +1983,13 @@ ; GFX10-NEXT: s_bfe_u32 s1, s1, s6 ; GFX10-NEXT: s_lshl_b32 s11, s11, 8 ; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_or_b32 s10, s10, s11 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s10, s10, s11 ; GFX10-NEXT: s_or_b32 s11, s12, s13 ; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 ; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_or_b32 s0, s10, s0 ; GFX10-NEXT: s_or_b32 s1, s11, s1 ; GFX10-NEXT: s_or_b32 s0, s0, s8 ; GFX10-NEXT: s_or_b32 s1, s1, s9 @@ -2011,8 +2011,8 @@ ; GFX10-NEXT: s_and_b32 s5, s0, s2 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, s6 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_bfe_u32 s0, s0, s6 ; GFX10-NEXT: s_lshl_b32 s7, s7, 8 @@ -2042,9 +2042,9 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_and_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 @@ -2075,15 +2075,15 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 ; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -2111,14 +2111,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 @@ -2133,12 +2133,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 @@ -2171,13 +2171,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -2185,8 +2185,8 @@ ; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -2197,13 +2197,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -2221,8 +2221,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 @@ -2231,8 +2231,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 @@ -2247,8 +2247,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 @@ -2285,13 +2285,13 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s10 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_bfe_u32 s8, s1, s9 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_and_b32 s5, s1, s7 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s10 ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 @@ -2308,13 +2308,13 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 @@ -2322,8 +2322,8 @@ ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 @@ -2347,13 +2347,13 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s8 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s6, s0 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s6, s1, s7 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s1, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -2372,9 +2372,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2410,13 +2410,13 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s8 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s6, s0 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_bfe_u32 s6, s1, s7 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, s5 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s8 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2434,24 +2434,24 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -2484,12 +2484,12 @@ ; GFX10-NEXT: s_lshl_b32 s5, s10, 8 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s5, s9, s5 ; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_lshl_b32 s8, s8, 24 +; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_or_b32 s1, s3, s1 ; GFX10-NEXT: s_or_b32 s0, s0, s7 ; GFX10-NEXT: s_or_b32 s1, s1, s8 @@ -2513,13 +2513,13 @@ ; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 ; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm @@ -2547,13 +2547,13 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s10 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_bfe_u32 s8, s1, s9 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_and_b32 s5, s1, s7 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s10 ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 @@ -2572,11 +2572,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2585,8 +2585,8 @@ ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 @@ -2610,13 +2610,13 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s8 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s6, s0 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s6, s1, s7 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s1, s5 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s8 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -2676,13 +2676,13 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s8 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s6, s0 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_bfe_u32 s6, s1, s7 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, s5 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s8 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2706,19 +2706,19 @@ ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -2737,12 +2737,12 @@ ; GFX10-NEXT: s_mov_b32 s3, 0x80008 ; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: s_and_b32 s4, s4, s2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, s2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s8, s0, s3 @@ -2760,8 +2760,8 @@ ; GFX10-NEXT: s_bfe_u32 s0, s0, s5 ; GFX10-NEXT: s_lshl_b32 s5, s8, 8 ; GFX10-NEXT: s_or_b32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s3, s4, 24 ; GFX10-NEXT: s_or_b32 s4, s7, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_or_b32 s0, s4, s0 @@ -2815,35 +2815,35 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s7, s0 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_bfe_u32 s7, s1, s8 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s4, s1, s6 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s9 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s3, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_and_or_b32 v4, v0, s6, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -2852,8 +2852,8 @@ ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 @@ -2877,24 +2877,24 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s5, s0 -; GFX8-NEXT: s_bfe_u32 s5, s1, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s5, s1, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s1, s4 -; GFX8-NEXT: s_bfe_u32 s1, s1, s7 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s7 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s4 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 @@ -2942,13 +2942,13 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s5, s0 -; GFX7-NEXT: s_bfe_u32 s5, s1, s6 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_bfe_u32 s5, s1, s6 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, s4 -; GFX7-NEXT: s_bfe_u32 s1, s1, s7 ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s7 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2956,11 +2956,11 @@ ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -2972,19 +2972,19 @@ ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -3068,9 +3068,9 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 @@ -3100,15 +3100,15 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 ; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -3119,9 +3119,9 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, 8 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 @@ -3137,14 +3137,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 @@ -3158,12 +3158,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 @@ -3198,21 +3198,21 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -3223,13 +3223,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3249,14 +3249,14 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, s3 ; GFX10-NEXT: s_and_b32 s0, s2, s3 ; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 @@ -3270,8 +3270,8 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v3, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -3298,9 +3298,9 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 @@ -3329,15 +3329,15 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 ; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -3365,14 +3365,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 @@ -3387,12 +3387,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 @@ -3426,13 +3426,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 @@ -3440,8 +3440,8 @@ ; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 @@ -3452,13 +3452,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3474,8 +3474,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v4 @@ -3526,11 +3526,11 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 +; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 @@ -3575,12 +3575,12 @@ ; GFX8-LABEL: insertelement_v_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v7, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 @@ -3594,14 +3594,14 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 @@ -3615,12 +3615,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3634,9 +3634,9 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 @@ -3655,13 +3655,13 @@ ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 @@ -3682,12 +3682,12 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3701,22 +3701,22 @@ ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, v4, v5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 @@ -3730,13 +3730,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3768,33 +3768,33 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s13 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s11, s0 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 ; GFX9-NEXT: s_lshl_b32 s6, s6, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_bfe_u32 s11, s1, s12 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s6, s1, s10 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_or_b32 s6, s6, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s6, s1 ; GFX9-NEXT: s_lshl_b32 s6, s7, 24 ; GFX9-NEXT: s_bfe_u32 s7, s2, s12 -; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_lshr_b32 s8, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s6 ; GFX9-NEXT: s_and_b32 s6, s2, s10 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s2, s2, s13 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9-NEXT: s_bfe_u32 s7, s3, s12 ; GFX9-NEXT: s_or_b32 s2, s6, s2 ; GFX9-NEXT: s_lshl_b32 s6, s8, 24 -; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_bfe_u32 s7, s3, s12 ; GFX9-NEXT: s_lshr_b32 s9, s3, 24 +; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_and_b32 s6, s3, s10 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s3, s3, s13 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s6, s3 @@ -3825,38 +3825,38 @@ ; GFX9-NEXT: s_bfe_u32 s9, s0, s12 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 ; GFX9-NEXT: s_and_b32 s8, s0, s10 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, s13 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 -; GFX9-NEXT: s_bfe_u32 s8, s1, s12 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_bfe_u32 s8, s1, s12 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s4, s1, s10 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_or_b32 s4, s4, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 ; GFX9-NEXT: s_bfe_u32 s5, s2, s12 -; GFX9-NEXT: s_or_b32 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s4 ; GFX9-NEXT: s_and_b32 s4, s2, s10 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_bfe_u32 s2, s2, s13 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9-NEXT: s_bfe_u32 s5, s3, s12 ; GFX9-NEXT: s_or_b32 s2, s4, s2 ; GFX9-NEXT: s_lshl_b32 s4, s6, 24 -; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_bfe_u32 s5, s3, s12 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_and_b32 s4, s3, s10 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_bfe_u32 s3, s3, s13 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s4, s3 @@ -3886,33 +3886,33 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s13 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s11, s0 -; GFX8-NEXT: s_bfe_u32 s11, s1, s12 ; GFX8-NEXT: s_lshl_b32 s6, s6, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_bfe_u32 s11, s1, s12 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s1, s10 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 ; GFX8-NEXT: s_lshl_b32 s11, s11, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s13 ; GFX8-NEXT: s_or_b32 s6, s6, s11 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s6, s1 ; GFX8-NEXT: s_lshl_b32 s6, s7, 24 ; GFX8-NEXT: s_bfe_u32 s7, s2, s12 -; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s6, s2, s10 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s2, s2, s13 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_bfe_u32 s7, s3, s12 ; GFX8-NEXT: s_or_b32 s2, s6, s2 ; GFX8-NEXT: s_lshl_b32 s6, s8, 24 -; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_bfe_u32 s7, s3, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 24 +; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s3, s10 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s3, s3, s13 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s6, s3 @@ -3943,38 +3943,38 @@ ; GFX8-NEXT: s_bfe_u32 s9, s0, s12 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_and_b32 s8, s0, s10 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, s13 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 -; GFX8-NEXT: s_bfe_u32 s8, s1, s12 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_bfe_u32 s8, s1, s12 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s4, s1, s10 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s13 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s5, 24 ; GFX8-NEXT: s_bfe_u32 s5, s2, s12 -; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_and_b32 s4, s2, s10 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s2, s2, s13 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_bfe_u32 s5, s3, s12 ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 24 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_bfe_u32 s5, s3, s12 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 +; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, s3, s10 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s3, s3, s13 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 @@ -4002,33 +4002,33 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s13 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s11, s0 -; GFX7-NEXT: s_bfe_u32 s11, s1, s12 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s6 +; GFX7-NEXT: s_bfe_u32 s11, s1, s12 ; GFX7-NEXT: s_lshr_b32 s7, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s6 ; GFX7-NEXT: s_and_b32 s6, s1, s10 -; GFX7-NEXT: s_bfe_u32 s1, s1, s13 ; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s13 ; GFX7-NEXT: s_or_b32 s6, s6, s11 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s6, s1 ; GFX7-NEXT: s_lshl_b32 s6, s7, 24 ; GFX7-NEXT: s_bfe_u32 s7, s2, s12 -; GFX7-NEXT: s_or_b32 s1, s1, s6 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s6 ; GFX7-NEXT: s_and_b32 s6, s2, s10 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s2, s2, s13 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_bfe_u32 s7, s3, s12 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 +; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_and_b32 s6, s3, s10 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s3, s3, s13 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s6, s3 @@ -4059,8 +4059,8 @@ ; GFX7-NEXT: s_bfe_u32 s14, s5, s12 ; GFX7-NEXT: s_lshr_b32 s4, s5, 24 ; GFX7-NEXT: s_and_b32 s11, s5, s10 -; GFX7-NEXT: s_bfe_u32 s5, s5, s13 ; GFX7-NEXT: s_lshl_b32 s14, s14, 8 +; GFX7-NEXT: s_bfe_u32 s5, s5, s13 ; GFX7-NEXT: s_or_b32 s11, s11, s14 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 ; GFX7-NEXT: s_or_b32 s5, s11, s5 @@ -4069,28 +4069,28 @@ ; GFX7-NEXT: s_lshr_b32 s6, s7, 24 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_and_b32 s5, s7, s10 -; GFX7-NEXT: s_bfe_u32 s7, s7, s13 ; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s7, s7, s13 ; GFX7-NEXT: s_or_b32 s5, s5, s11 ; GFX7-NEXT: s_lshl_b32 s7, s7, 16 ; GFX7-NEXT: s_or_b32 s5, s5, s7 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_bfe_u32 s7, s2, s12 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_and_b32 s6, s2, s10 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s2, s2, s13 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_or_b32 s6, s2, s6 +; GFX7-NEXT: s_bfe_u32 s7, s3, s12 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 +; GFX7-NEXT: s_or_b32 s6, s2, s6 ; GFX7-NEXT: s_and_b32 s2, s3, s10 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s3, s3, s13 ; GFX7-NEXT: s_or_b32 s2, s2, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -4117,35 +4117,35 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s14, s0, s7 ; GFX10-NEXT: s_lshr_b32 s9, s0, 24 -; GFX10-NEXT: s_bfe_u32 s16, s1, s7 ; GFX10-NEXT: s_and_b32 s13, s0, s6 ; GFX10-NEXT: s_bfe_u32 s0, s0, s8 +; GFX10-NEXT: s_bfe_u32 s16, s1, s7 ; GFX10-NEXT: s_lshl_b32 s14, s14, 8 ; GFX10-NEXT: s_lshr_b32 s10, s1, 24 ; GFX10-NEXT: s_and_b32 s15, s1, s6 ; GFX10-NEXT: s_bfe_u32 s1, s1, s8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s13, s13, s14 ; GFX10-NEXT: s_lshl_b32 s16, s16, 8 +; GFX10-NEXT: s_or_b32 s13, s13, s14 ; GFX10-NEXT: s_bfe_u32 s18, s2, s7 ; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s13, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s14, s15, s16 -; GFX10-NEXT: s_or_b32 s0, s0, s9 +; GFX10-NEXT: s_or_b32 s0, s13, s0 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s9, s18, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 ; GFX10-NEXT: s_lshl_b32 s10, s10, 24 ; GFX10-NEXT: s_or_b32 s1, s14, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s9 +; GFX10-NEXT: s_lshl_b32 s9, s18, 8 +; GFX10-NEXT: s_bfe_u32 s2, s2, s8 ; GFX10-NEXT: s_or_b32 s9, s17, s9 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s10 ; GFX10-NEXT: s_bfe_u32 s10, s3, s7 +; GFX10-NEXT: s_lshr_b32 s12, s3, 24 ; GFX10-NEXT: s_or_b32 s2, s9, s2 ; GFX10-NEXT: s_lshl_b32 s9, s11, 24 -; GFX10-NEXT: s_lshr_b32 s12, s3, 24 ; GFX10-NEXT: s_and_b32 s11, s3, s6 ; GFX10-NEXT: s_lshl_b32 s10, s10, 8 ; GFX10-NEXT: s_bfe_u32 s3, s3, s8 @@ -4191,20 +4191,20 @@ ; GFX10-NEXT: s_and_b32 s12, s1, s6 ; GFX10-NEXT: s_lshl_b32 s10, s10, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, s8 +; GFX10-NEXT: s_or_b32 s10, s12, s10 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s4, s5, 24 ; GFX10-NEXT: s_bfe_u32 s5, s2, s7 -; GFX10-NEXT: s_or_b32 s10, s12, s10 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_lshr_b32 s9, s2, 24 ; GFX10-NEXT: s_or_b32 s1, s10, s1 ; GFX10-NEXT: s_and_b32 s10, s2, s6 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s4, s3, s7 ; GFX10-NEXT: s_or_b32 s5, s10, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s1, s4 +; GFX10-NEXT: s_bfe_u32 s4, s3, s7 ; GFX10-NEXT: s_lshr_b32 s11, s3, 24 ; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: s_and_b32 s5, s3, s6 @@ -4235,11 +4235,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v6, 8 ; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_lshr_b32 s4, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 @@ -4260,8 +4260,8 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -4281,9 +4281,9 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] ; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 @@ -4297,14 +4297,14 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8 ; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 ; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10 @@ -4316,8 +4316,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_movk_i32 s0, 0xff @@ -4336,24 +4336,24 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 @@ -4367,9 +4367,9 @@ ; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -4378,22 +4378,22 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 @@ -4425,36 +4425,36 @@ ; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v14, s6, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -4467,45 +4467,45 @@ ; GFX7-NEXT: v_and_b32_e32 v4, s7, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 @@ -4533,23 +4533,23 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_or3_b32 v3, v3, v10, v6 @@ -4563,12 +4563,12 @@ ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -4578,14 +4578,14 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 @@ -4618,33 +4618,33 @@ ; GFX9-NEXT: s_bfe_u32 s0, s0, s14 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s12, s0 -; GFX9-NEXT: s_bfe_u32 s12, s1, s13 ; GFX9-NEXT: s_lshl_b32 s7, s7, 24 -; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_bfe_u32 s12, s1, s13 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s1, s11 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 ; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s14 ; GFX9-NEXT: s_or_b32 s7, s7, s12 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s7, s1 ; GFX9-NEXT: s_lshl_b32 s7, s8, 24 ; GFX9-NEXT: s_bfe_u32 s8, s2, s13 -; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s9, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_and_b32 s7, s2, s11 -; GFX9-NEXT: s_bfe_u32 s2, s2, s14 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s2, s2, s14 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 -; GFX9-NEXT: s_bfe_u32 s8, s3, s13 ; GFX9-NEXT: s_or_b32 s2, s7, s2 ; GFX9-NEXT: s_lshl_b32 s7, s9, 24 -; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_bfe_u32 s8, s3, s13 ; GFX9-NEXT: s_lshr_b32 s10, s3, 24 +; GFX9-NEXT: s_or_b32 s2, s2, s7 ; GFX9-NEXT: s_and_b32 s7, s3, s11 -; GFX9-NEXT: s_bfe_u32 s3, s3, s14 ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s3, s3, s14 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s7, s3 @@ -4665,19 +4665,19 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 3 +; GFX9-NEXT: s_mov_b32 s6, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s6, 16 ; GFX9-NEXT: v_and_or_b32 v8, v0, s11, v8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 @@ -4689,8 +4689,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 @@ -4699,9 +4699,9 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -4724,33 +4724,33 @@ ; GFX8-NEXT: s_bfe_u32 s0, s0, s12 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s10, s0 -; GFX8-NEXT: s_bfe_u32 s10, s1, s11 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_bfe_u32 s10, s1, s11 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_and_b32 s5, s1, s9 -; GFX8-NEXT: s_bfe_u32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s12 ; GFX8-NEXT: s_or_b32 s5, s5, s10 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s5, s1 ; GFX8-NEXT: s_lshl_b32 s5, s6, 24 ; GFX8-NEXT: s_bfe_u32 s6, s2, s11 -; GFX8-NEXT: s_or_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s5, s2, s9 -; GFX8-NEXT: s_bfe_u32 s2, s2, s12 ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s2, s2, s12 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_bfe_u32 s6, s3, s11 ; GFX8-NEXT: s_or_b32 s2, s5, s2 ; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_bfe_u32 s6, s3, s11 ; GFX8-NEXT: s_lshr_b32 s8, s3, 24 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_and_b32 s5, s3, s9 -; GFX8-NEXT: s_bfe_u32 s3, s3, s12 ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s3, s3, s12 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s5, s3 @@ -4772,16 +4772,16 @@ ; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4799,13 +4799,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 @@ -4833,33 +4833,33 @@ ; GFX7-NEXT: s_bfe_u32 s0, s0, s12 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s10, s0 -; GFX7-NEXT: s_bfe_u32 s10, s1, s11 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 -; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_bfe_u32 s10, s1, s11 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_and_b32 s5, s1, s9 -; GFX7-NEXT: s_bfe_u32 s1, s1, s12 ; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s12 ; GFX7-NEXT: s_or_b32 s5, s5, s10 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s5, s1 ; GFX7-NEXT: s_lshl_b32 s5, s6, 24 ; GFX7-NEXT: s_bfe_u32 s6, s2, s11 -; GFX7-NEXT: s_or_b32 s1, s1, s5 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s5 ; GFX7-NEXT: s_and_b32 s5, s2, s9 -; GFX7-NEXT: s_bfe_u32 s2, s2, s12 ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s2, s2, s12 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 -; GFX7-NEXT: s_bfe_u32 s6, s3, s11 ; GFX7-NEXT: s_or_b32 s2, s5, s2 ; GFX7-NEXT: s_lshl_b32 s5, s7, 24 -; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_bfe_u32 s6, s3, s11 ; GFX7-NEXT: s_lshr_b32 s8, s3, 24 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_and_b32 s5, s3, s9 -; GFX7-NEXT: s_bfe_u32 s3, s3, s12 ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s3, s3, s12 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s5, s3 @@ -4880,51 +4880,51 @@ ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v8, s9, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s9, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s9, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s9, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 @@ -4956,26 +4956,26 @@ ; GFX10-NEXT: s_bfe_u32 s1, s1, s7 ; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_or_b32 s12, s12, s13 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s12, s12, s13 ; GFX10-NEXT: s_or_b32 s13, s14, s15 ; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 ; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 ; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_or_b32 s0, s12, s0 ; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s1, s1, s9 +; GFX10-NEXT: s_bfe_u32 s6, s3, s6 ; GFX10-NEXT: s_lshr_b32 s10, s2, 24 +; GFX10-NEXT: s_lshr_b32 s11, s3, 24 ; GFX10-NEXT: s_and_b32 s16, s2, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s8 ; GFX10-NEXT: s_lshl_b32 s8, s17, 8 ; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s1, s9 ; GFX10-NEXT: s_and_b32 s9, s3, s5 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 ; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, s7 ; GFX10-NEXT: s_or_b32 s8, s16, s8 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s6, s9, s6 @@ -5020,12 +5020,12 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 @@ -5053,7 +5053,7 @@ ; GFX9-NEXT: s_mov_b32 s13, 0x80008 ; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s14, s0, s13 ; GFX9-NEXT: s_and_b32 s8, s0, s12 @@ -5069,8 +5069,8 @@ ; GFX9-NEXT: s_bfe_u32 s5, s1, s13 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 ; GFX9-NEXT: s_and_b32 s0, s1, s12 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s14 ; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -5081,8 +5081,8 @@ ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_bfe_u32 s1, s2, s14 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_or_b32 s10, s0, s1 @@ -5097,47 +5097,47 @@ ; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_or_b32 s11, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: s_mov_b32 s7, 16 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s7, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_and_or_b32 v8, v0, s12, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_and_or_b32 v8, v0, s12, v8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 @@ -5146,9 +5146,9 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5165,8 +5165,8 @@ ; GFX8-NEXT: s_bfe_u32 s9, s0, s13 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_and_b32 s8, s0, s12 -; GFX8-NEXT: s_bfe_u32 s0, s0, s14 ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, s14 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 @@ -5175,8 +5175,8 @@ ; GFX8-NEXT: s_bfe_u32 s5, s1, s13 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 ; GFX8-NEXT: s_and_b32 s0, s1, s12 -; GFX8-NEXT: s_bfe_u32 s1, s1, s14 ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s14 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -5187,8 +5187,8 @@ ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_bfe_u32 s1, s2, s14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: s_or_b32 s10, s0, s1 @@ -5199,23 +5199,23 @@ ; GFX8-NEXT: s_bfe_u32 s1, s3, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_or_b32 s11, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 @@ -5248,13 +5248,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 @@ -5277,8 +5277,8 @@ ; GFX7-NEXT: s_bfe_u32 s9, s0, s13 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 ; GFX7-NEXT: s_and_b32 s8, s0, s12 -; GFX7-NEXT: s_bfe_u32 s0, s0, s14 ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, s14 ; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s8, s0 @@ -5287,8 +5287,8 @@ ; GFX7-NEXT: s_bfe_u32 s5, s1, s13 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 ; GFX7-NEXT: s_and_b32 s0, s1, s12 -; GFX7-NEXT: s_bfe_u32 s1, s1, s14 ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s14 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -5299,8 +5299,8 @@ ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_bfe_u32 s1, s2, s14 -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: s_or_b32 s10, s0, s1 @@ -5311,23 +5311,23 @@ ; GFX7-NEXT: s_bfe_u32 s1, s3, s14 ; GFX7-NEXT: s_lshr_b32 s11, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_or_b32 s11, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_and_b32 s4, s4, s12 -; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s12, v0 ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s12, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -5339,43 +5339,43 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v8, s12, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 ; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s12, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s12, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s12, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 @@ -5400,6 +5400,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s13, s0, s6 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 @@ -5418,9 +5419,9 @@ ; GFX10-NEXT: s_or_b32 s0, s12, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_or_b32 s8, s0, s8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: s_and_b32 s16, s2, s5 +; GFX10-NEXT: s_or_b32 s8, s0, s8 ; GFX10-NEXT: s_lshl_b32 s0, s17, 8 ; GFX10-NEXT: s_bfe_u32 s2, s2, s7 ; GFX10-NEXT: s_lshl_b32 s9, s9, 24 @@ -5429,24 +5430,23 @@ ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s9, s1, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_bfe_u32 s2, s3, s6 ; GFX10-NEXT: s_lshl_b32 s1, s10, 24 +; GFX10-NEXT: s_bfe_u32 s2, s3, s6 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_or_b32 s10, s0, s1 -; GFX10-NEXT: s_bfe_u32 s1, s3, s7 ; GFX10-NEXT: s_and_b32 s6, s3, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_or_b32 s10, s0, s1 +; GFX10-NEXT: s_bfe_u32 s1, s3, s7 ; GFX10-NEXT: s_or_b32 s0, s6, s2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 ; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: s_lshl_b32 s2, s11, 24 ; GFX10-NEXT: s_mov_b32 s3, 8 -; GFX10-NEXT: s_or_b32 s11, s1, s2 +; GFX10-NEXT: s_lshl_b32 s2, s11, 24 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 +; GFX10-NEXT: s_or_b32 s11, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: s_and_b32 s2, s4, s5 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -5466,20 +5466,20 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5503,7 +5503,7 @@ ; GFX9-NEXT: s_mov_b32 s12, 0x80008 ; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s13, s0, s12 ; GFX9-NEXT: s_and_b32 s11, s0, s10 @@ -5516,11 +5516,11 @@ ; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 ; GFX9-NEXT: s_bfe_u32 s11, s1, s12 -; GFX9-NEXT: s_or_b32 s4, s0, s4 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_or_b32 s4, s0, s4 ; GFX9-NEXT: s_and_b32 s0, s1, s10 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, s13 ; GFX9-NEXT: s_or_b32 s0, s0, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -5531,8 +5531,8 @@ ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_bfe_u32 s1, s2, s13 -; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s6, 24 ; GFX9-NEXT: s_or_b32 s6, s0, s1 @@ -5547,17 +5547,17 @@ ; GFX9-NEXT: s_lshl_b32 s1, s7, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_or_b32 s7, s0, s1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s10 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 @@ -5566,27 +5566,27 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s8, 8 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: s_mov_b32 s9, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_mov_b32 s9, 16 -; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 @@ -5595,9 +5595,9 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 ; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5609,7 +5609,7 @@ ; GFX8-NEXT: s_mov_b32 s10, 0x80008 ; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s11, s0, s10 ; GFX8-NEXT: s_and_b32 s9, s0, s8 @@ -5622,11 +5622,11 @@ ; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 ; GFX8-NEXT: s_bfe_u32 s9, s1, s10 -; GFX8-NEXT: s_or_b32 s4, s0, s4 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 +; GFX8-NEXT: s_or_b32 s4, s0, s4 ; GFX8-NEXT: s_and_b32 s0, s1, s8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s11 ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, s11 ; GFX8-NEXT: s_or_b32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -5637,8 +5637,8 @@ ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_bfe_u32 s1, s2, s11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s6, s0, s1 @@ -5653,17 +5653,17 @@ ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_or_b32 s7, s0, s1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 @@ -5696,13 +5696,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 @@ -5720,7 +5720,7 @@ ; GFX7-NEXT: s_mov_b32 s10, 0x80008 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s11, s0, s10 ; GFX7-NEXT: s_and_b32 s9, s0, s8 @@ -5733,11 +5733,11 @@ ; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 ; GFX7-NEXT: s_bfe_u32 s9, s1, s10 -; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 +; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_and_b32 s0, s1, s8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s11 ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s11 ; GFX7-NEXT: s_or_b32 s0, s0, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 @@ -5748,8 +5748,8 @@ ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_bfe_u32 s1, s2, s11 -; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_lshr_b32 s6, s2, 24 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s6, s0, s1 @@ -5764,18 +5764,18 @@ ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_or_b32 s7, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -5789,41 +5789,41 @@ ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v8, s8, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 @@ -5849,6 +5849,7 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s12, s0, s7 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 @@ -5861,10 +5862,10 @@ ; GFX10-NEXT: s_lshl_b32 s4, s4, 24 ; GFX10-NEXT: s_or_b32 s0, s11, s0 ; GFX10-NEXT: s_bfe_u32 s14, s1, s7 -; GFX10-NEXT: s_or_b32 s4, s0, s4 -; GFX10-NEXT: s_bfe_u32 s0, s2, s9 ; GFX10-NEXT: s_and_b32 s15, s2, s8 ; GFX10-NEXT: s_lshl_b32 s16, s16, 8 +; GFX10-NEXT: s_or_b32 s4, s0, s4 +; GFX10-NEXT: s_bfe_u32 s0, s2, s9 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 ; GFX10-NEXT: s_and_b32 s13, s1, s8 ; GFX10-NEXT: s_bfe_u32 s1, s1, s9 @@ -5892,12 +5893,11 @@ ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 ; GFX10-NEXT: s_lshl_b32 s2, s10, 24 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 +; GFX10-NEXT: s_mov_b32 s3, 8 ; GFX10-NEXT: s_or_b32 s7, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 -; GFX10-NEXT: s_mov_b32 s3, 8 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 ; GFX10-NEXT: v_and_or_b32 v5, v2, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 @@ -5913,20 +5913,20 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 -; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11 -; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5949,8 +5949,8 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 @@ -5962,26 +5962,26 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: s_and_b32 s0, s2, s6 ; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 +; GFX9-NEXT: s_and_b32 s0, s2, s6 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 ; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 ; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc @@ -5994,29 +5994,29 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] ; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 ; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 ; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 @@ -6029,8 +6029,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, 8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_and_b32 s1, s2, s0 @@ -6042,33 +6042,33 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 ; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 @@ -6080,9 +6080,9 @@ ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -6091,19 +6091,19 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 @@ -6139,86 +6139,86 @@ ; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 ; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 @@ -6247,23 +6247,23 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 -; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 ; GFX10-NEXT: s_and_b32 s1, s2, s3 ; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, s3 @@ -6290,19 +6290,19 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 -; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10 ; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v2, v12, v15, v8 ; GFX10-NEXT: v_or3_b32 v3, v14, v7, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -6319,11 +6319,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6342,8 +6342,8 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -6363,29 +6363,29 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] ; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 ; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 ; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 @@ -6400,13 +6400,13 @@ ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, 8 ; GFX8-NEXT: v_mov_b32_e32 v11, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_lshr_b32 s4, s2, 2 ; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s2, 2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: s_not_b32 s5, s0 @@ -6420,27 +6420,27 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc ; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 @@ -6449,9 +6449,9 @@ ; GFX8-NEXT: v_and_b32_e32 v5, s5, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -6460,19 +6460,19 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 @@ -6507,87 +6507,87 @@ ; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 ; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 ; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc ; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] ; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 ; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v8, s6, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v10, s6, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 -; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 @@ -6614,23 +6614,23 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v6 -; GFX10-NEXT: v_or3_b32 v3, v3, v12, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_or3_b32 v4, v4, v14, v8 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_or3_b32 v3, v3, v12, v7 +; GFX10-NEXT: v_or3_b32 v4, v4, v14, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo ; GFX10-NEXT: v_or3_b32 v5, v5, v16, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: v_or3_b32 v6, v6, v11, v7 @@ -6644,12 +6644,12 @@ ; GFX10-NEXT: v_and_or_b32 v2, v7, s2, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v2, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -6662,14 +6662,14 @@ ; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14 -; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_or3_b32 v0, v1, v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_or3_b32 v0, v1, v11, v3 ; GFX10-NEXT: v_or3_b32 v1, v6, v13, v7 ; GFX10-NEXT: v_or3_b32 v2, v10, v15, v8 ; GFX10-NEXT: v_or3_b32 v3, v12, v16, v9 @@ -6688,9 +6688,9 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 @@ -6699,14 +6699,14 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17 -; GFX9-NEXT: v_and_or_b32 v17, v7, v0, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 2, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v13, v4, s2, v13 ; GFX9-NEXT: v_and_or_b32 v15, v5, s2, v15 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17 +; GFX9-NEXT: v_and_or_b32 v17, v7, v0, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 2, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v7 @@ -6715,10 +6715,10 @@ ; GFX9-NEXT: v_or3_b32 v9, v13, v14, v9 ; GFX9-NEXT: v_or3_b32 v10, v15, v16, v10 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX9-NEXT: v_or3_b32 v6, v6, v18, v11 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_or3_b32 v6, v6, v18, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v10, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6746,17 +6746,17 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 ; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 ; GFX9-NEXT: v_or3_b32 v1, v9, v16, v10 ; GFX9-NEXT: v_or3_b32 v2, v6, v18, v11 ; GFX9-NEXT: v_or3_b32 v3, v13, v8, v12 @@ -6774,30 +6774,30 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v14, v15, v16 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v14, v3 @@ -6816,8 +6816,8 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v0, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v0, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[2:3] ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -6826,28 +6826,28 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 -; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v8 +; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm @@ -6876,41 +6876,41 @@ ; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v11, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v13, s0, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_bfe_u32 v18, v7, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v15, v6, v8 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX7-NEXT: v_bfe_u32 v18, v7, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_or_b32_e32 v12, v13, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v12, v13, v14 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v7 ; GFX7-NEXT: v_and_b32_e32 v17, v7, v8 ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; GFX7-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_or_b32_e32 v14, v17, v18 ; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX7-NEXT: v_or_b32_e32 v7, v14, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v10 @@ -6924,39 +6924,39 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v9, v0, v8 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v11, v1, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX7-NEXT: v_and_b32_e32 v13, v3, v8 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 ; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 -; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 @@ -6973,9 +6973,9 @@ ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_mov_b32_e32 v9, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6986,23 +6986,23 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7 -; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 -; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v6, v6, v1, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13 ; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13 ; GFX10-NEXT: v_or3_b32 v6, v6, v19, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, v0, v1 ; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10 @@ -7028,19 +7028,19 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 -; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 +; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 ; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 ; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -111,23 +111,23 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v15, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s4 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v13, s9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v14, s10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 @@ -195,23 +195,23 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s10 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 @@ -274,23 +274,23 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 4 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 5 -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 7 @@ -399,23 +399,23 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 @@ -769,7 +769,6 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 @@ -783,6 +782,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 @@ -834,7 +834,6 @@ ; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: v_mov_b32_e32 v3, s4 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; MOVREL-NEXT: v_mov_b32_e32 v4, s5 ; MOVREL-NEXT: v_mov_b32_e32 v5, s6 ; MOVREL-NEXT: v_mov_b32_e32 v6, s7 @@ -850,6 +849,7 @@ ; MOVREL-NEXT: v_mov_b32_e32 v16, s17 ; MOVREL-NEXT: v_mov_b32_e32 v17, s18 ; MOVREL-NEXT: v_mov_b32_e32 v18, s19 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 3, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s10, 2, v2 @@ -986,8 +986,8 @@ ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: v_mov_b32_e32 v16, s15 ; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; MOVREL-NEXT: v_mov_b32_e32 v15, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s12 @@ -1002,14 +1002,14 @@ ; MOVREL-NEXT: v_mov_b32_e32 v4, s3 ; MOVREL-NEXT: v_mov_b32_e32 v3, s2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1 @@ -1232,10 +1232,10 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 @@ -1486,14 +1486,14 @@ ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v18 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[2:3] @@ -2075,18 +2075,18 @@ ; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v4, s4 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s3 -; MOVREL-NEXT: v_mov_b32_e32 v8, s8 +; MOVREL-NEXT: v_mov_b32_e32 v4, s4 ; MOVREL-NEXT: v_mov_b32_e32 v5, s5 ; MOVREL-NEXT: v_mov_b32_e32 v6, s6 ; MOVREL-NEXT: v_mov_b32_e32 v7, s7 -; MOVREL-NEXT: v_mov_b32_e32 v12, s12 +; MOVREL-NEXT: v_mov_b32_e32 v8, s8 ; MOVREL-NEXT: v_mov_b32_e32 v9, s9 ; MOVREL-NEXT: v_mov_b32_e32 v10, s10 ; MOVREL-NEXT: v_mov_b32_e32 v11, s11 +; MOVREL-NEXT: v_mov_b32_e32 v12, s12 ; MOVREL-NEXT: v_mov_b32_e32 v13, s13 ; MOVREL-NEXT: v_mov_b32_e32 v14, s14 ; MOVREL-NEXT: v_mov_b32_e32 v15, s15 @@ -2119,14 +2119,14 @@ ; GPRIDX-NEXT: v_add_u32_e32 v18, 1, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v18 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v18 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v18 -; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[2:3] @@ -3510,20 +3510,20 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v13, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 -; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v8, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 4 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6 @@ -3581,20 +3581,20 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 @@ -3915,22 +3915,22 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] ; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5] ; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] ; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 @@ -3996,8 +3996,8 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo ; MOVREL-NEXT: v_readfirstlane_b32 s3, v6 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 @@ -4082,20 +4082,20 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v16 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v16 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 6, v16 -; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[10:11] -; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[10:11] ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[10:11] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 @@ -4122,24 +4122,24 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 -; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4 -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v4 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v5 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s7, v7 @@ -4220,17 +4220,17 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[2:3] ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v2 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3 @@ -4277,13 +4277,13 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo -; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 ; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo -; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v0, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v1, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v0, v10, v0, s1 @@ -4334,18 +4334,18 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v10, v1, s[2:3] ; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v10, v1, s[2:3] ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 @@ -4393,19 +4393,19 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5 -; MOVREL-NEXT: v_readfirstlane_b32 s3, v6 +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0 -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v3 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v4 +; MOVREL-NEXT: v_readfirstlane_b32 s3, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v7 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v8 @@ -4426,15 +4426,15 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], s2, 2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s2, 3 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s2, 4 -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[8:9] ; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] ; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1 @@ -4456,8 +4456,8 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -4489,16 +4489,16 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v12 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v12 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v12 -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] ; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7] ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -443,8 +443,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -458,8 +458,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -746,8 +746,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -761,8 +761,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -990,11 +990,11 @@ ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc @@ -1012,11 +1012,11 @@ ; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc @@ -1067,8 +1067,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1083,8 +1083,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1549,11 +1549,11 @@ ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc @@ -1571,11 +1571,11 @@ ; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc @@ -1626,8 +1626,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1642,8 +1642,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -512,8 +512,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -527,8 +527,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -599,8 +599,8 @@ ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 9 ; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v2 offset:8 @@ -1086,11 +1086,11 @@ ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc @@ -1108,11 +1108,11 @@ ; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc @@ -1162,8 +1162,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1178,8 +1178,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1412,9 +1412,9 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1461,8 +1461,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1476,8 +1476,8 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1491,8 +1491,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:20 glc @@ -1506,8 +1506,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc @@ -1561,8 +1561,8 @@ ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 9 -; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16 @@ -1575,8 +1575,8 @@ ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 9 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -1767,11 +1767,11 @@ ; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 ; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc @@ -1789,11 +1789,11 @@ ; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc @@ -1831,8 +1831,8 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -1861,8 +1861,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1877,8 +1877,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1908,8 +1908,8 @@ ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -141,8 +141,8 @@ ; GFX7-LABEL: s_div_fmas_f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s6, 1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s4 @@ -159,8 +159,8 @@ ; GFX8-LABEL: s_div_fmas_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -530,8 +530,8 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -548,8 +548,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -590,8 +590,8 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -608,8 +608,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -650,8 +650,8 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -668,8 +668,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -710,8 +710,8 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 @@ -728,8 +728,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -496,8 +496,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -540,8 +540,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -623,8 +623,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -667,8 +667,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1270,8 +1270,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1314,8 +1314,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1397,8 +1397,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1441,8 +1441,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -73,8 +73,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -99,8 +99,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 @@ -130,8 +130,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -156,8 +156,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 @@ -239,8 +239,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -265,8 +265,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -296,8 +296,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -322,8 +322,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -457,8 +457,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -483,8 +483,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -514,8 +514,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 @@ -540,8 +540,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 +; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 @@ -569,8 +569,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -620,8 +620,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -5,6 +5,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -15,7 +16,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -52,6 +52,7 @@ define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d_tfe: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -62,7 +63,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -114,6 +114,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) { ; GFX6-LABEL: gather4_cube: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -124,7 +125,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -161,6 +161,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) { ; GFX6-LABEL: gather4_2darray: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -171,7 +172,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -208,6 +208,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) { ; GFX6-LABEL: gather4_c_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -218,7 +219,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -255,6 +255,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_cl_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -265,7 +266,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -302,6 +302,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_c_cl_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -312,7 +313,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -349,6 +349,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { ; GFX6-LABEL: gather4_b_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -359,7 +360,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -396,6 +396,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) { ; GFX6-LABEL: gather4_c_b_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -406,7 +407,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -443,6 +443,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_b_cl_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -453,7 +454,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -490,6 +490,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_c_b_cl_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -500,7 +501,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -701,6 +701,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d_dmask_2: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -711,7 +712,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -748,6 +748,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d_dmask_4: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -758,7 +759,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -795,6 +795,7 @@ define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d_dmask_8: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -805,7 +806,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -5,6 +5,7 @@ define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { ; GFX6-LABEL: gather4_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -15,7 +16,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -52,6 +52,7 @@ define amdgpu_ps <4 x float> @gather4_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { ; GFX6-LABEL: gather4_c_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -62,7 +63,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -99,6 +99,7 @@ define amdgpu_ps <4 x float> @gather4_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_cl_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -109,7 +110,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -146,6 +146,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_c_cl_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -156,7 +157,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -193,6 +193,7 @@ define amdgpu_ps <4 x float> @gather4_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t) { ; GFX6-LABEL: gather4_b_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -203,7 +204,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -240,6 +240,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t) { ; GFX6-LABEL: gather4_c_b_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -250,7 +251,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec @@ -328,6 +328,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp) { ; GFX6-LABEL: gather4_c_b_cl_o_2d: ; GFX6: ; %bb.0: ; %main_body +; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, s4 @@ -338,7 +339,6 @@ ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: s_mov_b32 s8, s10 ; GFX6-NEXT: s_mov_b32 s9, s11 -; GFX6-NEXT: s_mov_b64 s[14:15], exec ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: s_wqm_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -73,13 +73,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 @@ -135,13 +135,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: v_mov_b32_e32 v11, v7 -; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -49,9 +49,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -89,7 +89,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v5 ; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -97,6 +96,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, v8 @@ -118,9 +118,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 @@ -158,7 +158,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v5 ; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -166,6 +165,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -75,7 +75,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; GFX10-NEXT: v_mov_b32_e32 v13, v9 -; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -84,6 +83,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; GFX10-NEXT: v_mov_b32_e32 v3, v12 @@ -141,7 +141,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; GFX10-NEXT: v_mov_b32_e32 v13, v9 -; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -150,6 +149,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; GFX10-NEXT: v_mov_b32_e32 v3, v12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -15,8 +15,8 @@ ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -48,13 +48,13 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 @@ -92,11 +92,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v5 ; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 ; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, v8 @@ -117,13 +117,13 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 ; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 @@ -161,11 +161,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v5 ; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 ; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -74,7 +74,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GFX10-NEXT: v_mov_b32_e32 v12, v8 -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -82,6 +81,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; GFX10-NEXT: v_mov_b32_e32 v3, v11 @@ -138,7 +138,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v10, v8 ; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GFX10-NEXT: v_mov_b32_e32 v12, v8 -; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -146,6 +145,7 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, v8 ; GFX10-NEXT: v_mov_b32_e32 v1, v9 ; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; GFX10-NEXT: v_mov_b32_e32 v3, v11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -38,12 +38,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4 ; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1 +; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -281,14 +281,14 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 +; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -302,14 +302,14 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 +; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -117,9 +117,9 @@ ; GFX81-NEXT: s_mov_b32 s3, s5 ; GFX81-NEXT: s_mov_b32 s4, s6 ; GFX81-NEXT: s_mov_b32 s5, s7 -; GFX81-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX81-NEXT: s_mov_b32 s6, s8 ; GFX81-NEXT: s_mov_b32 s7, s9 +; GFX81-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX81-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX81-NEXT: v_mov_b32_e32 v4, 0 ; GFX81-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -208,13 +208,13 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v5, v0 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX1030-NEXT: v_mov_b32_e32 v14, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 ; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo @@ -340,13 +340,13 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v6, v0 -; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 -; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo @@ -430,8 +430,8 @@ ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 @@ -460,8 +460,8 @@ ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -509,16 +509,16 @@ ; GFX1030-NEXT: v_mov_b32_e32 v3, s7 ; GFX1030-NEXT: s_movk_i32 s5, 0x4400 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX1030-NEXT: s_movk_i32 s6, 0x4200 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX1030-NEXT: s_movk_i32 s6, 0x4200 +; GFX1030-NEXT: flat_load_dword v0, v[0:1] +; GFX1030-NEXT: flat_load_dword v1, v[2:3] +; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX1030-NEXT: s_movk_i32 s7, 0x4800 ; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 -; GFX1030-NEXT: flat_load_dword v0, v[0:1] -; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: s_movk_i32 s4, 0x4500 ; GFX1030-NEXT: s_or_b32 s5, s6, s5 ; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000 @@ -553,8 +553,8 @@ ; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 ; GFX1013-NEXT: s_movk_i32 s0, 0x4500 -; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: s_or_b32 s1, s2, s1 +; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) @@ -564,18 +564,18 @@ ; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: s_movk_i32 s5, 0x4600 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 -; GFX1013-NEXT: s_movk_i32 s4, 0x4700 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX1013-NEXT: s_or_b32 s0, s0, s2 +; GFX1013-NEXT: s_movk_i32 s4, 0x4700 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] -; GFX1013-NEXT: s_or_b32 s2, s4, s3 +; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000 +; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: s_or_b32 s0, s0, s2 +; GFX1013-NEXT: s_or_b32 s2, s4, s3 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, s1 @@ -705,17 +705,17 @@ ; GFX1030-NEXT: s_movk_i32 s4, 0x4500 ; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 +; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX1030-NEXT: s_or_b32 s5, s6, s5 +; GFX1030-NEXT: flat_load_dword v2, v[0:1] ; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1030-NEXT: s_lshl_b32 s6, s6, 16 -; GFX1030-NEXT: flat_load_dword v2, v[0:1] +; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: s_or_b32 s4, s4, s6 ; GFX1030-NEXT: s_or_b32 s6, s8, s7 -; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 -; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: v_mov_b32_e32 v6, s5 ; GFX1030-NEXT: v_mov_b32_e32 v7, s4 ; GFX1030-NEXT: v_mov_b32_e32 v8, s6 @@ -749,13 +749,13 @@ ; GFX1013-NEXT: s_movk_i32 s3, 0x4800 ; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX1013-NEXT: s_or_b32 s1, s2, s1 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: s_or_b32 s1, s2, s1 ; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 ; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 ; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: flat_load_dword v2, v[0:1] ; GFX1013-NEXT: s_or_b32 s0, s0, s2 ; GFX1013-NEXT: s_or_b32 s2, s8, s3 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -47,9 +47,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -57,8 +57,8 @@ ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, s4, v7 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v0, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v0, v1 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v12 ; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v4 @@ -94,8 +94,8 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11 -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) @@ -211,8 +211,8 @@ ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v2 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v5, v6 @@ -323,10 +323,10 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v12 @@ -346,10 +346,10 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_mov_b32_e32 v4, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, v1 ; GFX7-NEXT: v_mov_b32_e32 v8, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v13 ; GFX7-NEXT: v_mov_b32_e32 v2, v12 @@ -405,8 +405,8 @@ ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) @@ -471,8 +471,8 @@ ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10 ; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11 -; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff ; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff +; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) @@ -498,17 +498,17 @@ ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v10 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7 @@ -585,13 +585,15 @@ ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) +; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -86,12 +86,13 @@ ; GFX9-NEXT: v_and_b32_e32 v7, v8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v6, v11, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v6, v11, v3 ; GFX9-NEXT: v_and_or_b32 v5, v9, v3, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 @@ -123,8 +124,8 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 @@ -158,18 +159,18 @@ ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, v10, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 @@ -216,9 +217,10 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(7) +; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) ; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 -; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 ; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 @@ -230,17 +232,17 @@ ; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 -; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 ; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 ; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load @@ -266,9 +268,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(4) ; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, s4, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -294,19 +296,19 @@ ; GFX7-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(6) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -138,9 +138,9 @@ ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v0 @@ -189,11 +189,12 @@ ; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 @@ -244,14 +245,14 @@ ; GFX7-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -39,8 +39,8 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 @@ -74,18 +74,18 @@ ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(4) ; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, v10, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 @@ -132,9 +132,10 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) ; GFX10-NEXT: v_and_or_b32 v1, v8, s4, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(7) +; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(5) ; GFX10-NEXT: v_and_b32_e32 v8, v12, v11 -; GFX10-NEXT: v_and_or_b32 v4, v9, s4, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) ; GFX10-NEXT: v_and_b32_e32 v9, v13, v11 ; GFX10-NEXT: v_and_or_b32 v7, v10, v11, v7 @@ -146,17 +147,17 @@ ; GFX10-NEXT: v_and_b32_e32 v0, v0, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v10, v14, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v0 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 -; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 ; GFX10-NEXT: v_or3_b32 v1, v4, v5, v6 ; GFX10-NEXT: v_or3_b32 v2, v7, v8, v9 +; GFX10-NEXT: v_or3_b32 v3, v10, v11, v12 ; GFX10-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load @@ -222,9 +223,9 @@ ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, v6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v0 @@ -273,11 +274,12 @@ ; GFX10-NEXT: v_and_or_b32 v1, v10, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) ; GFX10-NEXT: v_and_or_b32 v4, v11, s4, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v7, v0, v12, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -128,12 +128,12 @@ ; GFX9-NEXT: s_getpc_b64 s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v0, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -824,8 +824,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: s_lshr_b32 s1, s2, s4 @@ -838,8 +838,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: s_lshr_b32 s1, s2, s4 @@ -963,8 +963,8 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 @@ -1015,8 +1015,8 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_and_b32 s3, s3, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s2, s1 @@ -1026,15 +1026,15 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s4, s7 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s2, s4, s7 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -1049,15 +1049,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_and_b32 s0, s0, s5 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s4, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_and_b32 s1, s1, s5 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_and_b32 s3, s3, s5 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 @@ -1137,15 +1137,15 @@ ; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 ; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1215,15 +1215,15 @@ ; GFX6-NEXT: s_and_b32 s8, s14, s16 ; GFX6-NEXT: s_and_b32 s6, s6, s16 ; GFX6-NEXT: s_lshr_b32 s6, s6, s8 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_and_b32 s8, s15, s16 ; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 -; GFX6-NEXT: s_lshr_b32 s7, s7, s8 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: s_or_b32 s3, s6, s3 ; GFX6-NEXT: ; return to shader part epilog @@ -1232,35 +1232,35 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s13, s4, 16 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s4, s8, s13 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_lshr_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s4, s8, s13 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s6, s6, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 -; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_and_b32 s7, s7, s12 +; GFX8-NEXT: s_lshr_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s10, s15 +; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 -; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshr_b32 s7, s11, s16 +; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_or_b32 s2, s4, s2 @@ -1273,29 +1273,29 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s9, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_and_b32 s0, s0, s9 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_and_b32 s4, s4, s9 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s8, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: s_and_b32 s1, s1, s9 +; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: s_and_b32 s5, s5, s9 ; GFX9-NEXT: s_lshr_b32 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s4, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_and_b32 s6, s6, s9 ; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_and_b32 s6, s7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 @@ -1306,28 +1306,28 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s10, s4, s8 ; GFX10-NEXT: s_and_b32 s0, s0, s8 +; GFX10-NEXT: s_and_b32 s10, s4, s8 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s10 ; GFX10-NEXT: s_lshr_b32 s4, s9, s4 ; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_and_b32 s10, s5, s8 ; GFX10-NEXT: s_and_b32 s1, s1, s8 +; GFX10-NEXT: s_and_b32 s10, s5, s8 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s10 ; GFX10-NEXT: s_lshr_b32 s5, s9, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s5, s6, s8 ; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s5, s6, s8 ; GFX10-NEXT: s_lshr_b32 s6, s6, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s6, s7, s8 ; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_and_b32 s6, s7, s8 ; GFX10-NEXT: s_lshr_b32 s7, s7, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s5, s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -338,9 +338,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 @@ -353,9 +353,9 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 @@ -792,9 +792,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -807,9 +807,9 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b64 s[4:5], 0x400 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -382,8 +382,8 @@ ; GFX7-LABEL: v_mul_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4 @@ -394,8 +394,8 @@ ; GFX8-LABEL: v_mul_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 @@ -432,24 +432,24 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_mul_i32 s7, s1, s3 ; GFX7-NEXT: s_mul_i32 s8, s0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_add_u32 s7, s7, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; GFX7-NEXT: s_mul_i32 s7, s1, s4 ; GFX7-NEXT: s_mul_i32 s2, s2, s3 +; GFX7-NEXT: s_mul_i32 s7, s1, s4 ; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3 -; GFX7-NEXT: s_cselect_b32 s8, 1, 0 ; GFX7-NEXT: s_mul_i32 s6, s0, s3 +; GFX7-NEXT: s_cselect_b32 s8, 1, 0 ; GFX7-NEXT: s_mul_i32 s5, s0, s5 ; GFX7-NEXT: s_add_i32 s0, s2, s7 ; GFX7-NEXT: s_add_i32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s8, s8, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; GFX7-NEXT: s_and_b32 s8, s8, 1 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -462,24 +462,24 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mul_i32 s7, s1, s3 ; GFX8-NEXT: s_mul_i32 s8, s0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_add_u32 s7, s7, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0 -; GFX8-NEXT: s_mul_i32 s7, s1, s4 ; GFX8-NEXT: s_mul_i32 s2, s2, s3 +; GFX8-NEXT: s_mul_i32 s7, s1, s4 ; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3 -; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_mul_i32 s6, s0, s3 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_mul_i32 s5, s0, s5 ; GFX8-NEXT: s_add_i32 s0, s2, s7 ; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 @@ -500,8 +500,8 @@ ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_add_i32 s8, s8, s9 -; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: s_mul_i32 s9, s1, s4 ; GFX9-NEXT: s_mul_i32 s5, s0, s5 ; GFX9-NEXT: s_add_i32 s2, s2, s9 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3 @@ -522,9 +522,9 @@ ; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3 ; GFX10-NEXT: s_add_u32 s6, s6, s7 ; GFX10-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-NEXT: s_mul_i32 s9, s1, s4 -; GFX10-NEXT: s_and_b32 s7, s7, 1 ; GFX10-NEXT: s_mul_i32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; GFX10-NEXT: s_mul_i32 s9, s1, s4 ; GFX10-NEXT: s_add_u32 s6, s6, s8 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: s_mul_i32 s5, s0, s5 @@ -604,12 +604,12 @@ ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 @@ -629,8 +629,8 @@ ; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4 ; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3 @@ -665,16 +665,16 @@ ; GFX7-NEXT: s_mul_i32 s10, s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_add_u32 s9, s9, s10 -; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 +; GFX7-NEXT: s_cselect_b32 s10, 1, 0 ; GFX7-NEXT: s_mul_i32 s11, s0, s6 ; GFX7-NEXT: s_and_b32 s10, s10, 1 -; GFX7-NEXT: s_add_u32 s9, s9, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_cselect_b32 s11, 1, 0 +; GFX7-NEXT: s_add_u32 s9, s9, s11 ; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2 +; GFX7-NEXT: s_cselect_b32 s11, 1, 0 ; GFX7-NEXT: s_and_b32 s11, s11, 1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2 ; GFX7-NEXT: s_add_i32 s10, s10, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5 @@ -685,14 +685,14 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: s_mul_i32 s5, s2, s5 ; GFX7-NEXT: s_mul_i32 s3, s3, s4 +; GFX7-NEXT: s_mul_i32 s5, s2, s5 ; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: s_mul_i32 s8, s0, s4 ; GFX7-NEXT: s_mul_i32 s9, s1, s6 -; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX7-NEXT: s_mul_i32 s7, s0, s7 +; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5 ; GFX7-NEXT: s_add_i32 s0, s3, s5 ; GFX7-NEXT: s_add_i32 s0, s0, s9 @@ -723,16 +723,16 @@ ; GFX8-NEXT: s_mul_i32 s10, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_add_u32 s9, s9, s10 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 +; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: s_mul_i32 s11, s0, s6 ; GFX8-NEXT: s_and_b32 s10, s10, 1 -; GFX8-NEXT: s_add_u32 s9, s9, s11 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_add_u32 s9, s9, s11 ; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: s_and_b32 s11, s11, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: s_add_i32 s10, s10, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5 @@ -743,14 +743,14 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_mul_i32 s5, s2, s5 ; GFX8-NEXT: s_mul_i32 s3, s3, s4 +; GFX8-NEXT: s_mul_i32 s5, s2, s5 ; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_mul_i32 s8, s0, s4 ; GFX8-NEXT: s_mul_i32 s9, s1, s6 -; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX8-NEXT: s_mul_i32 s7, s0, s7 +; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5 ; GFX8-NEXT: s_add_i32 s0, s3, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s9 @@ -801,8 +801,8 @@ ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_and_b32 s11, s11, 1 ; GFX9-NEXT: s_add_i32 s12, s12, s11 -; GFX9-NEXT: s_mul_i32 s11, s2, s5 ; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_mul_i32 s11, s2, s5 ; GFX9-NEXT: s_mul_i32 s13, s1, s6 ; GFX9-NEXT: s_add_i32 s3, s3, s11 ; GFX9-NEXT: s_mul_i32 s7, s0, s7 @@ -812,8 +812,8 @@ ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5 ; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: s_mul_i32 s8, s0, s4 -; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6 +; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_add_i32 s0, s1, s0 ; GFX9-NEXT: s_add_i32 s3, s0, s12 ; GFX9-NEXT: s_mov_b32 s0, s8 @@ -916,11 +916,11 @@ ; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5 -; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 @@ -966,11 +966,11 @@ ; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6 ; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12 ; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12 ; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 @@ -1042,28 +1042,28 @@ ; GFX10-NEXT: v_add_co_u32 v8, s4, v8, v9 ; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v11 ; GFX10-NEXT: v_mul_hi_u32 v11, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v13, s4, v8, v13 ; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9 ; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5 ; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15 -; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4 ; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13 ; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6 +; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4 ; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10 +; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 ; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 ; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5 ; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1 ; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4 @@ -1090,16 +1090,16 @@ ; GFX7-NEXT: s_mul_i32 s18, s1, s9 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_add_u32 s17, s17, s18 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: s_mul_i32 s19, s0, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 -; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s9 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s19 ; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5 @@ -1108,30 +1108,30 @@ ; GFX7-NEXT: s_mul_i32 s18, s2, s9 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s18 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: s_mul_i32 s19, s1, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s19 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s20, s0, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s20 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, s10 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8 ; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8 ; GFX7-NEXT: s_mul_i32 s17, s4, s8 ; GFX7-NEXT: s_mul_i32 s18, s3, s9 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 @@ -1143,8 +1143,8 @@ ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX7-NEXT: s_add_u32 s17, s17, s19 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -1152,36 +1152,36 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s20 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8 +; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s21, s0, s12 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s21 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7 -; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9 +; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11 ; GFX7-NEXT: s_mul_i32 s17, s5, s8 ; GFX7-NEXT: s_mul_i32 s18, s4, s9 -; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 +; GFX7-NEXT: s_add_u32 s17, s17, s18 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v9, s11 +; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX7-NEXT: s_mul_i32 s19, s3, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 +; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GFX7-NEXT: s_add_u32 s17, s17, s19 -; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -1189,8 +1189,8 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GFX7-NEXT: s_add_u32 s17, s17, s20 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -1198,17 +1198,17 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s21 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; GFX7-NEXT: v_mov_b32_e32 v7, s4 -; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8 +; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s22, s0, s13 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s22 ; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8 ; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1216,27 +1216,27 @@ ; GFX7-NEXT: s_mul_i32 s17, s6, s8 ; GFX7-NEXT: s_mul_i32 s18, s5, s9 ; GFX7-NEXT: s_add_u32 s17, s17, s18 -; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6 +; GFX7-NEXT: s_cselect_b32 s18, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX7-NEXT: s_mul_i32 s19, s4, s10 ; GFX7-NEXT: s_and_b32 s18, s18, 1 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s19 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX7-NEXT: s_and_b32 s19, s19, 1 -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v12, s12 +; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX7-NEXT: s_mul_i32 s20, s3, s11 ; GFX7-NEXT: s_add_i32 s18, s18, s19 +; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX7-NEXT: s_add_u32 s17, s17, s20 -; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -1244,8 +1244,8 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX7-NEXT: s_add_u32 s17, s17, s21 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4 @@ -1253,46 +1253,46 @@ ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX7-NEXT: s_add_u32 s17, s17, s22 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v8, s5 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX7-NEXT: s_and_b32 s19, s19, 1 ; GFX7-NEXT: s_mul_i32 s23, s0, s14 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: s_add_u32 s17, s17, s23 -; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10 +; GFX7-NEXT: s_cselect_b32 s19, 1, 0 ; GFX7-NEXT: s_and_b32 s19, s19, 1 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10 ; GFX7-NEXT: s_add_i32 s18, s18, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17 ; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17 ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9 ; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mov_b32_e32 v15, s13 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16 ; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GFX7-NEXT: v_mov_b32_e32 v13, s14 ; GFX7-NEXT: s_mul_i32 s7, s7, s8 ; GFX7-NEXT: s_mul_i32 s17, s6, s9 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX7-NEXT: v_mov_b32_e32 v13, s14 ; GFX7-NEXT: s_mul_i32 s16, s0, s8 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX7-NEXT: s_mul_i32 s5, s5, s10 ; GFX7-NEXT: s_mul_i32 s15, s0, s15 ; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13 @@ -1301,17 +1301,17 @@ ; GFX7-NEXT: s_mul_i32 s4, s4, s11 ; GFX7-NEXT: s_add_i32 s0, s0, s5 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GFX7-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-NEXT: s_mul_i32 s11, s3, s12 +; GFX7-NEXT: v_mov_b32_e32 v10, s6 ; GFX7-NEXT: s_add_i32 s0, s0, s4 ; GFX7-NEXT: s_mul_i32 s12, s2, s13 -; GFX7-NEXT: s_add_i32 s0, s0, s11 ; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8 +; GFX7-NEXT: s_add_i32 s0, s0, s11 ; GFX7-NEXT: s_mul_i32 s13, s1, s14 -; GFX7-NEXT: s_add_i32 s0, s0, s12 ; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9 -; GFX7-NEXT: s_add_i32 s0, s0, s13 +; GFX7-NEXT: s_add_i32 s0, s0, s12 ; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10 +; GFX7-NEXT: s_add_i32 s0, s0, s13 ; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9 ; GFX7-NEXT: s_add_i32 s0, s0, s15 ; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12 @@ -1350,16 +1350,16 @@ ; GFX8-NEXT: s_mul_i32 s18, s1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_add_u32 s17, s17, s18 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: s_mul_i32 s19, s0, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 -; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s19 ; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5 @@ -1368,30 +1368,30 @@ ; GFX8-NEXT: s_mul_i32 s18, s2, s9 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s18 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: s_mul_i32 s19, s1, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s19 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s20, s0, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s20 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5 ; GFX8-NEXT: v_mov_b32_e32 v6, s10 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8 ; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8 ; GFX8-NEXT: s_mul_i32 s17, s4, s8 ; GFX8-NEXT: s_mul_i32 s18, s3, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 @@ -1403,8 +1403,8 @@ ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: s_add_u32 s17, s17, s19 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -1412,36 +1412,36 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s20 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8 +; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s21, s0, s12 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s21 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7 -; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9 +; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11 ; GFX8-NEXT: s_mul_i32 s17, s5, s8 ; GFX8-NEXT: s_mul_i32 s18, s4, s9 -; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_add_u32 s17, s17, s18 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: s_mul_i32 s19, s3, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 +; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7 ; GFX8-NEXT: s_add_u32 s17, s17, s19 -; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 @@ -1449,8 +1449,8 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: s_add_u32 s17, s17, s20 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 @@ -1458,17 +1458,17 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s21 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_mov_b32_e32 v7, s4 -; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8 +; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s22, s0, s13 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s22 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8 ; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -1476,27 +1476,27 @@ ; GFX8-NEXT: s_mul_i32 s17, s6, s8 ; GFX8-NEXT: s_mul_i32 s18, s5, s9 ; GFX8-NEXT: s_add_u32 s17, s17, s18 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; GFX8-NEXT: s_mul_i32 s19, s4, s10 ; GFX8-NEXT: s_and_b32 s18, s18, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s19 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 ; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v12, s12 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: s_mul_i32 s20, s3, s11 ; GFX8-NEXT: s_add_i32 s18, s18, s19 +; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; GFX8-NEXT: s_add_u32 s17, s17, s20 -; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 @@ -1504,8 +1504,8 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: s_add_u32 s17, s17, s21 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 @@ -1513,46 +1513,46 @@ ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: s_add_u32 s17, s17, s22 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_mov_b32_e32 v8, s5 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8 ; GFX8-NEXT: s_and_b32 s19, s19, 1 ; GFX8-NEXT: s_mul_i32 s23, s0, s14 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: s_add_u32 s17, s17, s23 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_and_b32 s19, s19, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10 ; GFX8-NEXT: s_add_i32 s18, s18, s19 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17 ; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10 ; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mov_b32_e32 v15, s13 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GFX8-NEXT: v_mov_b32_e32 v13, s14 ; GFX8-NEXT: s_mul_i32 s7, s7, s8 ; GFX8-NEXT: s_mul_i32 s17, s6, s9 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 +; GFX8-NEXT: v_mov_b32_e32 v13, s14 ; GFX8-NEXT: s_mul_i32 s16, s0, s8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: s_mul_i32 s5, s5, s10 ; GFX8-NEXT: s_mul_i32 s15, s0, s15 ; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13 @@ -1561,17 +1561,17 @@ ; GFX8-NEXT: s_mul_i32 s4, s4, s11 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6 -; GFX8-NEXT: v_mov_b32_e32 v10, s6 ; GFX8-NEXT: s_mul_i32 s11, s3, s12 +; GFX8-NEXT: v_mov_b32_e32 v10, s6 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_mul_i32 s12, s2, s13 -; GFX8-NEXT: s_add_i32 s0, s0, s11 ; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8 +; GFX8-NEXT: s_add_i32 s0, s0, s11 ; GFX8-NEXT: s_mul_i32 s13, s1, s14 -; GFX8-NEXT: s_add_i32 s0, s0, s12 ; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9 -; GFX8-NEXT: s_add_i32 s0, s0, s13 +; GFX8-NEXT: s_add_i32 s0, s0, s12 ; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10 +; GFX8-NEXT: s_add_i32 s0, s0, s13 ; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9 ; GFX8-NEXT: s_add_i32 s0, s0, s15 ; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12 @@ -1826,8 +1826,8 @@ ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 ; GFX9-NEXT: s_and_b32 s23, s23, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s23 -; GFX9-NEXT: s_mul_i32 s23, s6, s9 ; GFX9-NEXT: s_mul_i32 s7, s7, s8 +; GFX9-NEXT: s_mul_i32 s23, s6, s9 ; GFX9-NEXT: s_mul_i32 s25, s5, s10 ; GFX9-NEXT: s_add_i32 s7, s7, s23 ; GFX9-NEXT: s_mul_i32 s26, s4, s11 @@ -1844,17 +1844,17 @@ ; GFX9-NEXT: s_add_i32 s7, s7, s15 ; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9 ; GFX9-NEXT: s_add_i32 s6, s7, s6 -; GFX9-NEXT: s_add_i32 s5, s6, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10 -; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_add_i32 s5, s6, s5 ; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11 -; GFX9-NEXT: s_add_i32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s5, s4 ; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12 -; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_add_i32 s3, s4, s3 ; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13 +; GFX9-NEXT: s_add_i32 s2, s3, s2 ; GFX9-NEXT: s_mul_i32 s16, s0, s8 -; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14 +; GFX9-NEXT: s_add_i32 s1, s2, s1 ; GFX9-NEXT: s_add_i32 s0, s1, s0 ; GFX9-NEXT: s_add_i32 s7, s0, s24 ; GFX9-NEXT: s_mov_b32 s0, s16 @@ -2166,7 +2166,6 @@ ; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20 @@ -2176,8 +2175,8 @@ ; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9 ; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18 ; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10 -; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 +; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc @@ -2186,12 +2185,10 @@ ; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 -; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21 -; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22 ; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc @@ -2206,6 +2203,7 @@ ; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19 ; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22 +; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20 ; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc @@ -2245,6 +2243,8 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22 ; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11 +; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23 ; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23 @@ -2342,9 +2342,9 @@ ; GFX7-NEXT: v_mul_hi_u32 v12, v2, v12 ; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13 ; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -2352,8 +2352,8 @@ ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12 ; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13 @@ -2392,7 +2392,6 @@ ; GFX8-NEXT: v_mul_hi_u32 v21, v0, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20 -; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20 @@ -2402,8 +2401,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9 ; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18 ; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10 -; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 +; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc @@ -2412,12 +2411,10 @@ ; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21 -; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22 ; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc @@ -2432,6 +2429,7 @@ ; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19 ; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22 +; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20 ; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc @@ -2471,6 +2469,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22 ; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11 +; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23 ; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23 @@ -2568,9 +2568,9 @@ ; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12 ; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 ; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -2578,8 +2578,8 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10 ; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13 @@ -2616,8 +2616,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 ; GFX9-NEXT: v_add_u32_e32 v17, v20, v17 +; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 ; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8 ; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc @@ -2769,8 +2769,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10 ; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11 -; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12 +; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 ; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13 ; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14 @@ -2778,8 +2778,8 @@ ; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24 ; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8 -; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13 ; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14 +; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13 ; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5 ; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9 ; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11 @@ -2807,53 +2807,53 @@ ; GFX10-NEXT: v_mul_lo_u32 v25, v1, v10 ; GFX10-NEXT: v_mul_hi_u32 v23, v0, v9 ; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17 -; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4 +; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10 ; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9 -; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9 ; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20 ; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18 ; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10 +; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15 ; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v20, s4, v22, v20 -; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11 ; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v21 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v25 -; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23 -; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9 ; GFX10-NEXT: v_add3_u32 v19, v24, v19, v21 ; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 +; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5 +; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v21 +; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17 ; GFX10-NEXT: v_add3_u32 v21, v26, v24, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8 -; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9 ; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23 -; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8 +; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9 ; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27 +; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18 ; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23 ; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10 -; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26 ; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5 +; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v26, v3, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v21 ; GFX10-NEXT: v_mul_lo_u32 v22, v0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 @@ -2865,116 +2865,116 @@ ; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9 ; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26 -; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11 +; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20 +; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8 ; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26 +; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8 ; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20 ; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10 +; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27 +; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11 ; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24 ; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21 ; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26 ; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12 +; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27 ; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19 ; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23 ; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26 ; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8 +; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19 +; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20 ; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23 ; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9 +; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20 ; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26 ; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10 ; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29 -; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 +; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11 +; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8 +; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9 ; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22 ; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 -; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28 ; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26 -; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11 +; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5 +; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24 -; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29 +; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12 ; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26 ; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27 ; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13 +; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29 ; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5 -; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20 ; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23 ; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 ; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4 -; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21 -; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11 +; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24 ; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29 ; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10 -; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 +; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5 ; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32 ; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31 +; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21 +; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11 ; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4 -; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25 -; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8 +; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9 +; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12 +; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26 ; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 ; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28 ; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13 +; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8 ; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4 ; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27 -; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10 -; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 ; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25 ; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14 ; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13 +; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10 +; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11 +; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 ; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12 ; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13 -; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28 -; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27 ; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15 ; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4 +; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27 ; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5 ; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -430,8 +430,8 @@ ; GFX6-LABEL: s_orn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -459,8 +459,8 @@ ; GFX6-LABEL: s_orn2_v2i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -488,8 +488,8 @@ ; GFX6-LABEL: s_orn2_v2i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 +; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_and_b32 s1, s4, s1 @@ -526,8 +526,8 @@ ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s3, s4, s1 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 +; GFX6-NEXT: s_and_b32 s3, s4, s1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_and_b32 s1, s6, s1 @@ -633,11 +633,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -676,11 +676,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -719,11 +719,11 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xffff ; GFX6-NEXT: s_and_b32 s1, s2, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s4, s6, s3 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s9, 16 ; GFX6-NEXT: s_and_b32 s3, s8, s3 @@ -773,8 +773,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 ; GFX6-NEXT: s_and_b32 s2, s4, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s3, s6, s14 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, s14 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 ; GFX6-NEXT: s_and_b32 s4, s8, s14 @@ -831,8 +831,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -375,10 +375,10 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_rndne_f16_e32 v2, v0 -; GFX8-NEXT: v_rndne_f16_e32 v3, v1 ; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_rndne_f16_e32 v3, v1 ; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -389,10 +389,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v2, v0 -; GFX9-NEXT: v_rndne_f16_e32 v3, v1 ; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_rndne_f16_e32 v3, v1 ; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 ; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -402,8 +402,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rndne_f16_e32 v2, v0 -; GFX10-NEXT: v_rndne_f16_e32 v3, v1 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_rndne_f16_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -10,8 +10,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 @@ -25,8 +25,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 +; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 @@ -62,8 +62,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 25 ; GFX6-NEXT: s_min_i32 s3, s0, 0 -; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25 +; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s1, s3, s1 @@ -84,8 +84,8 @@ ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_min_i32 s1, s1, s3 @@ -124,8 +124,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 @@ -139,8 +139,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 @@ -176,8 +176,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_min_i32 s3, s0, 0 -; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s1, s3, s1 @@ -198,8 +198,8 @@ ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_min_i32 s1, s1, s3 @@ -242,9 +242,9 @@ ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 @@ -277,9 +277,9 @@ ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 ; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v5, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 @@ -290,8 +290,8 @@ ; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 ; GFX8-NEXT: v_max_i16_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -349,9 +349,9 @@ ; GFX6-NEXT: s_min_i32 s7, s0, 0 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_max_i32 s1, s7, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s6 @@ -381,17 +381,17 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s7, s0 ; GFX8-NEXT: s_sext_i32_i16 s8, 0 -; GFX8-NEXT: s_max_i32 s9, s7, s8 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 +; GFX8-NEXT: s_max_i32 s9, s7, s8 ; GFX8-NEXT: s_min_i32 s7, s7, s8 -; GFX8-NEXT: s_sub_i32 s7, s6, s7 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_sub_i32 s7, s6, s7 ; GFX8-NEXT: s_movk_i32 s5, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s7, s1 ; GFX8-NEXT: s_sub_i32 s9, s5, s9 +; GFX8-NEXT: s_max_i32 s1, s7, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s7, s9 ; GFX8-NEXT: s_min_i32 s1, s1, s7 @@ -404,8 +404,8 @@ ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sub_i32 s5, s5, s7 +; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s3 @@ -427,8 +427,8 @@ ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_mov_b32 s2, 0x80008 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -451,8 +451,8 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 @@ -488,9 +488,9 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 @@ -507,9 +507,9 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v5 @@ -517,24 +517,24 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -555,9 +555,9 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 ; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v10, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 @@ -572,17 +572,17 @@ ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 ; GFX8-NEXT: v_sub_u16_e32 v4, v9, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v3 -; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 +; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_sub_u16_e32 v5, v9, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v6, v4 @@ -607,20 +607,20 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_add_i16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 @@ -691,9 +691,9 @@ ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_max_i32 s1, s11, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s10 @@ -718,14 +718,14 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_min_i32 s6, s3, 0 -; GFX6-NEXT: s_max_i32 s5, s3, 0 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 +; GFX6-NEXT: s_max_i32 s5, s3, 0 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s5 -; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 +; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 ; GFX6-NEXT: s_and_b32 s1, s1, s4 @@ -734,8 +734,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s3, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 @@ -751,19 +751,19 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s11, s0 ; GFX8-NEXT: s_sext_i32_i16 s12, 0 -; GFX8-NEXT: s_max_i32 s13, s11, s12 ; GFX8-NEXT: s_movk_i32 s10, 0x8000 +; GFX8-NEXT: s_max_i32 s13, s11, s12 ; GFX8-NEXT: s_min_i32 s11, s11, s12 -; GFX8-NEXT: s_sub_i32 s11, s10, s11 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_sub_i32 s11, s10, s11 ; GFX8-NEXT: s_movk_i32 s9, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s11, s11 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s11, s1 ; GFX8-NEXT: s_sub_i32 s13, s9, s13 +; GFX8-NEXT: s_max_i32 s1, s11, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s11, s13 ; GFX8-NEXT: s_min_i32 s1, s1, s11 @@ -776,8 +776,8 @@ ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sub_i32 s11, s9, s11 +; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s11 ; GFX8-NEXT: s_min_i32 s2, s2, s5 @@ -790,8 +790,8 @@ ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_max_i32 s3, s5, s3 ; GFX8-NEXT: s_sub_i32 s6, s9, s6 +; GFX8-NEXT: s_max_i32 s3, s5, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_min_i32 s3, s3, s5 @@ -804,15 +804,15 @@ ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sub_i32 s6, s9, s6 +; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 +; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -823,8 +823,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 @@ -838,19 +838,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_mov_b32 s4, 0x80008 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 @@ -859,19 +859,19 @@ ; GFX9-NEXT: s_lshr_b32 s7, s6, 16 ; GFX9-NEXT: s_lshl_b32 s4, s6, s4 ; GFX9-NEXT: s_lshl_b32 s6, s7, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 -; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -885,8 +885,8 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_mov_b32 s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 @@ -904,8 +904,8 @@ ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp @@ -935,8 +935,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 @@ -987,8 +987,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_min_i32 s3, s0, 0 -; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s1, s3, s1 @@ -1211,9 +1211,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 @@ -1232,9 +1232,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 @@ -1271,9 +1271,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_min_i32 s7, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_max_i32 s2, s7, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s6 @@ -1291,9 +1291,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_min_i32 s7, s0, 0 -; GFX8-NEXT: s_sub_i32 s7, s5, s7 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: s_max_i32 s6, s0, 0 +; GFX8-NEXT: s_sub_i32 s7, s5, s7 ; GFX8-NEXT: s_sub_i32 s6, s4, s6 ; GFX8-NEXT: s_max_i32 s2, s7, s2 ; GFX8-NEXT: s_min_i32 s2, s2, s6 @@ -1334,9 +1334,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 @@ -1348,9 +1348,9 @@ ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 @@ -1362,9 +1362,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 ; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 @@ -1376,9 +1376,9 @@ ; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, 0, v2 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 @@ -1410,9 +1410,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s7, 1 ; GFX6-NEXT: s_min_i32 s9, s0, 0 -; GFX6-NEXT: s_sub_i32 s9, s7, s9 ; GFX6-NEXT: s_brev_b32 s6, -2 ; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, s7, s9 ; GFX6-NEXT: s_sub_i32 s8, s6, s8 ; GFX6-NEXT: s_max_i32 s3, s9, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s8 @@ -1424,9 +1424,9 @@ ; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s4, s2, 0 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_add_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, 0 +; GFX6-NEXT: s_sub_i32 s4, s7, s4 ; GFX6-NEXT: s_sub_i32 s3, s6, s3 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s3, s4, s3 @@ -1437,9 +1437,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s7, 1 ; GFX8-NEXT: s_min_i32 s9, s0, 0 -; GFX8-NEXT: s_sub_i32 s9, s7, s9 ; GFX8-NEXT: s_brev_b32 s6, -2 ; GFX8-NEXT: s_max_i32 s8, s0, 0 +; GFX8-NEXT: s_sub_i32 s9, s7, s9 ; GFX8-NEXT: s_sub_i32 s8, s6, s8 ; GFX8-NEXT: s_max_i32 s3, s9, s3 ; GFX8-NEXT: s_min_i32 s3, s3, s8 @@ -1451,9 +1451,9 @@ ; GFX8-NEXT: s_max_i32 s4, s8, s4 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s4, s2, 0 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, 0 +; GFX8-NEXT: s_sub_i32 s4, s7, s4 ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s3, s4, s3 @@ -1492,9 +1492,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 @@ -1506,16 +1506,16 @@ ; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v3 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 @@ -1527,9 +1527,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8 ; GFX8-NEXT: v_max_i32_e32 v4, v9, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 @@ -1541,16 +1541,16 @@ ; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_max_i32_e32 v4, 0, v3 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 @@ -1584,9 +1584,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_min_i32 s11, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_max_i32 s4, s11, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s10 @@ -1598,16 +1598,16 @@ ; GFX6-NEXT: s_max_i32 s5, s10, s5 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, 0 +; GFX6-NEXT: s_sub_i32 s5, s9, s5 ; GFX6-NEXT: s_sub_i32 s4, s8, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, 0 +; GFX6-NEXT: s_sub_i32 s5, s9, s5 ; GFX6-NEXT: s_sub_i32 s4, s8, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s4, s5, s4 @@ -1618,9 +1618,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s9, 1 ; GFX8-NEXT: s_min_i32 s11, s0, 0 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 ; GFX8-NEXT: s_brev_b32 s8, -2 ; GFX8-NEXT: s_max_i32 s10, s0, 0 +; GFX8-NEXT: s_sub_i32 s11, s9, s11 ; GFX8-NEXT: s_sub_i32 s10, s8, s10 ; GFX8-NEXT: s_max_i32 s4, s11, s4 ; GFX8-NEXT: s_min_i32 s4, s4, s10 @@ -1632,16 +1632,16 @@ ; GFX8-NEXT: s_max_i32 s5, s10, s5 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s2, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, 0 +; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s3, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, 0 +; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s4, s5, s4 @@ -1685,9 +1685,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10 ; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 @@ -1699,25 +1699,25 @@ ; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 @@ -1729,9 +1729,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 +; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s4, v10 ; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 @@ -1743,25 +1743,25 @@ ; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 @@ -1797,9 +1797,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s11, 1 ; GFX6-NEXT: s_min_i32 s13, s0, 0 -; GFX6-NEXT: s_sub_i32 s13, s11, s13 ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_max_i32 s12, s0, 0 +; GFX6-NEXT: s_sub_i32 s13, s11, s13 ; GFX6-NEXT: s_sub_i32 s12, s10, s12 ; GFX6-NEXT: s_max_i32 s5, s13, s5 ; GFX6-NEXT: s_min_i32 s5, s5, s12 @@ -1811,23 +1811,23 @@ ; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_add_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, 0 +; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_sub_i32 s5, s10, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_add_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, 0 +; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_sub_i32 s5, s10, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s8 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s4, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_add_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, 0 +; GFX6-NEXT: s_sub_i32 s6, s11, s6 ; GFX6-NEXT: s_sub_i32 s5, s10, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s9 ; GFX6-NEXT: s_min_i32 s5, s6, s5 @@ -1838,9 +1838,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s11, 1 ; GFX8-NEXT: s_min_i32 s13, s0, 0 -; GFX8-NEXT: s_sub_i32 s13, s11, s13 ; GFX8-NEXT: s_brev_b32 s10, -2 ; GFX8-NEXT: s_max_i32 s12, s0, 0 +; GFX8-NEXT: s_sub_i32 s13, s11, s13 ; GFX8-NEXT: s_sub_i32 s12, s10, s12 ; GFX8-NEXT: s_max_i32 s5, s13, s5 ; GFX8-NEXT: s_min_i32 s5, s5, s12 @@ -1852,23 +1852,23 @@ ; GFX8-NEXT: s_max_i32 s6, s12, s6 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s2, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_add_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, 0 +; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s3, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_add_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, 0 +; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s8 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s4, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_add_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, 0 +; GFX8-NEXT: s_sub_i32 s6, s11, s6 ; GFX8-NEXT: s_sub_i32 s5, s10, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_min_i32 s5, s6, s5 @@ -2205,9 +2205,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s33, 1 ; GFX6-NEXT: s_min_i32 s35, s0, 0 -; GFX6-NEXT: s_sub_i32 s35, s33, s35 ; GFX6-NEXT: s_brev_b32 s32, -2 ; GFX6-NEXT: s_max_i32 s34, s0, 0 +; GFX6-NEXT: s_sub_i32 s35, s33, s35 ; GFX6-NEXT: s_sub_i32 s34, s32, s34 ; GFX6-NEXT: s_max_i32 s16, s35, s16 ; GFX6-NEXT: s_min_i32 s16, s16, s34 @@ -2219,100 +2219,100 @@ ; GFX6-NEXT: s_max_i32 s17, s34, s17 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s2, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s18 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s3, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s19 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s4, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s20 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s5, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s21 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s6, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s22 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s7, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s23 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s8, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s24 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s9, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s25 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s10, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s26 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s11, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s27 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s12, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s28 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s13, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s29 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s14, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s30 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s15, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_add_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, 0 +; GFX6-NEXT: s_sub_i32 s17, s33, s17 ; GFX6-NEXT: s_sub_i32 s16, s32, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s31 ; GFX6-NEXT: s_min_i32 s16, s17, s16 @@ -2323,9 +2323,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s33, 1 ; GFX8-NEXT: s_min_i32 s35, s0, 0 -; GFX8-NEXT: s_sub_i32 s35, s33, s35 ; GFX8-NEXT: s_brev_b32 s32, -2 ; GFX8-NEXT: s_max_i32 s34, s0, 0 +; GFX8-NEXT: s_sub_i32 s35, s33, s35 ; GFX8-NEXT: s_sub_i32 s34, s32, s34 ; GFX8-NEXT: s_max_i32 s16, s35, s16 ; GFX8-NEXT: s_min_i32 s16, s16, s34 @@ -2337,100 +2337,100 @@ ; GFX8-NEXT: s_max_i32 s17, s34, s17 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s2, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s18 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s3, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s19 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s4, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s20 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s5, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s21 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s6, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s22 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s7, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s23 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s8, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s24 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s9, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s25 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s10, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s26 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s11, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s27 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s12, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s28 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s13, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s29 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s14, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s30 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s15, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_add_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, 0 +; GFX8-NEXT: s_sub_i32 s17, s33, s17 ; GFX8-NEXT: s_sub_i32 s16, s32, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s31 ; GFX8-NEXT: s_min_i32 s16, s17, s16 @@ -2534,8 +2534,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 ; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 @@ -2577,8 +2577,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s3, s0, 0 -; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_max_i32 s2, s0, 0 ; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 ; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s1, s3, s1 @@ -2596,8 +2596,8 @@ ; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 ; GFX8-NEXT: s_min_i32 s1, s1, s2 @@ -2625,8 +2625,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s2, s0, 0 -; GFX6-NEXT: s_max_i32 s1, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_max_i32 s1, s0, 0 ; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 ; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 @@ -2667,8 +2667,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 +; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 @@ -2710,9 +2710,9 @@ ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2735,13 +2735,13 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_sub_u16_e32 v3, s4, v3 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 +; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 @@ -2776,9 +2776,9 @@ ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_min_i32 s7, s0, 0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, s5, s7 ; GFX6-NEXT: s_sub_i32 s6, s4, s6 ; GFX6-NEXT: s_max_i32 s2, s7, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s6 @@ -2805,16 +2805,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s6, s0 ; GFX8-NEXT: s_sext_i32_i16 s7, 0 -; GFX8-NEXT: s_max_i32 s8, s6, s7 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: s_max_i32 s8, s6, s7 ; GFX8-NEXT: s_min_i32 s6, s6, s7 ; GFX8-NEXT: s_sub_i32 s6, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sub_i32 s8, s4, s8 +; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s6, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 @@ -2826,8 +2826,8 @@ ; GFX8-NEXT: s_sub_i32 s1, s5, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_max_i32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s4, s4, s6 +; GFX8-NEXT: s_max_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_min_i32 s1, s1, s3 @@ -2862,9 +2862,9 @@ ; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s5, s3, s5 ; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, s3, s5 ; GFX6-NEXT: s_sub_i32 s4, s2, s4 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 @@ -2891,22 +2891,22 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s4, s0 ; GFX8-NEXT: s_sext_i32_i16 s5, 0 -; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_movk_i32 s3, 0x8000 +; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: s_sub_i32 s4, s3, s4 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 ; GFX8-NEXT: s_sub_i32 s6, s2, s6 +; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: v_min_i16_e32 v1, s6, v1 ; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_sub_i32 s2, s2, s6 +; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 @@ -2935,20 +2935,20 @@ ; GFX6-NEXT: s_brev_b32 s3, 1 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 ; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 -; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 +; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 -; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 +; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 @@ -2964,16 +2964,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, s2, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 +; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 ; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_sub_u16_e32 v4, s3, v4 ; GFX8-NEXT: v_sub_u16_e32 v3, s2, v3 ; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 @@ -3016,9 +3016,9 @@ ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3035,23 +3035,23 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 @@ -3072,26 +3072,26 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 +; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_sub_u16_e32 v6, s4, v6 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 +; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 ; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 ; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 ; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 ; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 ; GFX8-NEXT: v_sub_u16_e32 v9, s5, v9 @@ -3132,9 +3132,9 @@ ; GFX6-NEXT: s_brev_b32 s9, 1 ; GFX6-NEXT: s_min_i32 s11, s0, 0 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, s9, s11 ; GFX6-NEXT: s_sub_i32 s10, s8, s10 ; GFX6-NEXT: s_max_i32 s4, s11, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3159,14 +3159,14 @@ ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s6, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s4 -; GFX6-NEXT: s_max_i32 s5, s3, 0 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 +; GFX6-NEXT: s_max_i32 s5, s3, 0 ; GFX6-NEXT: s_sub_i32 s6, s9, s6 ; GFX6-NEXT: s_sub_i32 s5, s8, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s5 -; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s4 @@ -3185,16 +3185,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s10, s0 ; GFX8-NEXT: s_sext_i32_i16 s11, 0 -; GFX8-NEXT: s_max_i32 s12, s10, s11 ; GFX8-NEXT: s_movk_i32 s9, 0x8000 +; GFX8-NEXT: s_max_i32 s12, s10, s11 ; GFX8-NEXT: s_min_i32 s10, s10, s11 ; GFX8-NEXT: s_sub_i32 s10, s9, s10 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_movk_i32 s8, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sub_i32 s12, s8, s12 +; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s10, s12 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 @@ -3206,8 +3206,8 @@ ; GFX8-NEXT: s_sub_i32 s2, s9, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sub_i32 s10, s8, s10 +; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 ; GFX8-NEXT: s_min_i32 s2, s2, s6 @@ -3219,10 +3219,10 @@ ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_sub_i32 s6, s8, s6 -; GFX8-NEXT: s_sext_i32_i16 s3, s6 +; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s6 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -3289,9 +3289,9 @@ ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12 ; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3308,18 +3308,18 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 ; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 @@ -3327,8 +3327,8 @@ ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 @@ -3336,28 +3336,28 @@ ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 ; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3369,37 +3369,37 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 +; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 ; GFX8-NEXT: v_sub_u16_e32 v9, s4, v9 ; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 ; GFX8-NEXT: v_min_i16_e32 v9, v11, v9 ; GFX8-NEXT: v_max_i16_e32 v11, 0, v6 ; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 +; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 ; GFX8-NEXT: v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v13, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 ; GFX8-NEXT: v_max_i16_e32 v11, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v7 ; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 ; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 +; GFX8-NEXT: v_min_i16_e32 v14, 0, v7 ; GFX8-NEXT: v_min_i16_e32 v11, v13, v11 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v7 ; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 ; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 +; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 ; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff +; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 ; GFX8-NEXT: v_max_i16_e32 v13, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 ; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 @@ -3411,8 +3411,8 @@ ; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v11 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3449,9 +3449,9 @@ ; GFX6-NEXT: s_brev_b32 s13, 1 ; GFX6-NEXT: s_min_i32 s15, s0, 0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_sub_i32 s15, s13, s15 ; GFX6-NEXT: s_brev_b32 s12, -2 ; GFX6-NEXT: s_max_i32 s14, s0, 0 +; GFX6-NEXT: s_sub_i32 s15, s13, s15 ; GFX6-NEXT: s_sub_i32 s14, s12, s14 ; GFX6-NEXT: s_max_i32 s6, s15, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3476,8 +3476,8 @@ ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s8, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s6 -; GFX6-NEXT: s_max_i32 s7, s3, 0 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 +; GFX6-NEXT: s_max_i32 s7, s3, 0 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 @@ -3485,8 +3485,8 @@ ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s8, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s6 -; GFX6-NEXT: s_max_i32 s7, s4, 0 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 +; GFX6-NEXT: s_max_i32 s7, s4, 0 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 @@ -3494,14 +3494,14 @@ ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s8, s5, 0 ; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: s_max_i32 s7, s5, 0 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 +; GFX6-NEXT: s_max_i32 s7, s5, 0 ; GFX6-NEXT: s_sub_i32 s8, s13, s8 ; GFX6-NEXT: s_sub_i32 s7, s12, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s7 -; GFX6-NEXT: s_add_i32 s5, s5, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_add_i32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s6 @@ -3509,13 +3509,13 @@ ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s6 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -3526,16 +3526,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s14, s0 ; GFX8-NEXT: s_sext_i32_i16 s15, 0 -; GFX8-NEXT: s_max_i32 s16, s14, s15 ; GFX8-NEXT: s_movk_i32 s13, 0x8000 +; GFX8-NEXT: s_max_i32 s16, s14, s15 ; GFX8-NEXT: s_min_i32 s14, s14, s15 ; GFX8-NEXT: s_sub_i32 s14, s13, s14 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_movk_i32 s12, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s14, s14 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sub_i32 s16, s12, s16 +; GFX8-NEXT: s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s14, s16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 @@ -3547,8 +3547,8 @@ ; GFX8-NEXT: s_sub_i32 s3, s13, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sub_i32 s14, s12, s14 +; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 ; GFX8-NEXT: s_min_i32 s3, s3, s9 @@ -3560,10 +3560,10 @@ ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s9, s12, s9 -; GFX8-NEXT: s_sext_i32_i16 s4, s9 +; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_sext_i32_i16 s4, s9 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 @@ -3586,8 +3586,8 @@ ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 @@ -3654,9 +3654,9 @@ ; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 +; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 ; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -3673,18 +3673,18 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 @@ -3692,8 +3692,8 @@ ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 @@ -3701,8 +3701,8 @@ ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 @@ -3710,43 +3710,43 @@ ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 +; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -3758,50 +3758,50 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 ; GFX8-NEXT: v_sub_u16_e32 v12, s4, v12 ; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 +; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 ; GFX8-NEXT: v_min_i16_e32 v12, v14, v12 ; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 ; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 +; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 ; GFX8-NEXT: v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v16, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 ; GFX8-NEXT: v_max_i16_e32 v14, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 ; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 ; GFX8-NEXT: v_max_i16_e32 v16, v16, v5 +; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 ; GFX8-NEXT: v_min_i16_e32 v14, v16, v14 ; GFX8-NEXT: v_max_i16_e32 v16, 0, v9 ; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 ; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 +; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 +; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 ; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff +; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 ; GFX8-NEXT: v_max_i16_e32 v16, 0, v2 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 +; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 ; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 ; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 +; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 ; GFX8-NEXT: v_min_i16_e32 v16, v17, v16 ; GFX8-NEXT: v_max_i16_e32 v17, 0, v10 ; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 +; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 ; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v18, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v17 ; GFX8-NEXT: v_max_i16_e32 v17, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 ; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 @@ -3816,8 +3816,8 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_add_u16_e32 v1, v1, v14 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 @@ -3856,9 +3856,9 @@ ; GFX6-NEXT: s_brev_b32 s17, 1 ; GFX6-NEXT: s_min_i32 s19, s0, 0 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_sub_i32 s19, s17, s19 ; GFX6-NEXT: s_brev_b32 s16, -2 ; GFX6-NEXT: s_max_i32 s18, s0, 0 +; GFX6-NEXT: s_sub_i32 s19, s17, s19 ; GFX6-NEXT: s_sub_i32 s18, s16, s18 ; GFX6-NEXT: s_max_i32 s8, s19, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -3883,8 +3883,8 @@ ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s8 -; GFX6-NEXT: s_max_i32 s9, s3, 0 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 +; GFX6-NEXT: s_max_i32 s9, s3, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 @@ -3892,8 +3892,8 @@ ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s8 -; GFX6-NEXT: s_max_i32 s9, s4, 0 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 +; GFX6-NEXT: s_max_i32 s9, s4, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 @@ -3901,8 +3901,8 @@ ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s5, 0 ; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: s_max_i32 s9, s5, 0 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 +; GFX6-NEXT: s_max_i32 s9, s5, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 @@ -3910,8 +3910,8 @@ ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s6, 0 ; GFX6-NEXT: s_add_i32 s5, s5, s8 -; GFX6-NEXT: s_max_i32 s9, s6, 0 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 +; GFX6-NEXT: s_max_i32 s9, s6, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 @@ -3919,14 +3919,14 @@ ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s7, 0 ; GFX6-NEXT: s_add_i32 s6, s6, s8 -; GFX6-NEXT: s_max_i32 s9, s7, 0 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 +; GFX6-NEXT: s_max_i32 s9, s7, 0 ; GFX6-NEXT: s_sub_i32 s10, s17, s10 ; GFX6-NEXT: s_sub_i32 s9, s16, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s8 @@ -3934,19 +3934,19 @@ ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s8 ; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 @@ -3957,16 +3957,16 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s18, s0 ; GFX8-NEXT: s_sext_i32_i16 s19, 0 -; GFX8-NEXT: s_max_i32 s20, s18, s19 ; GFX8-NEXT: s_movk_i32 s17, 0x8000 +; GFX8-NEXT: s_max_i32 s20, s18, s19 ; GFX8-NEXT: s_min_i32 s18, s18, s19 ; GFX8-NEXT: s_sub_i32 s18, s17, s18 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_movk_i32 s16, 0x7fff ; GFX8-NEXT: s_sext_i32_i16 s18, s18 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sub_i32 s20, s16, s20 +; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s18, s20 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 @@ -3978,8 +3978,8 @@ ; GFX8-NEXT: s_sub_i32 s4, s17, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sub_i32 s18, s16, s18 +; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s18 ; GFX8-NEXT: s_min_i32 s4, s4, s12 @@ -3991,10 +3991,10 @@ ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s12, s16, s12 -; GFX8-NEXT: s_sext_i32_i16 s5, s12 +; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s12 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 @@ -4017,8 +4017,8 @@ ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 @@ -4030,8 +4030,8 @@ ; GFX8-NEXT: s_sub_i32 s4, s17, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s14 -; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 @@ -4052,8 +4052,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s4, s11 ; GFX8-NEXT: s_max_i32 s5, s4, s19 ; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_sub_i32 s4, s17, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s15 ; GFX8-NEXT: s_sub_i32 s5, s16, s5 @@ -4462,8 +4462,8 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 @@ -4488,8 +4488,8 @@ ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 @@ -4514,8 +4514,8 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 @@ -4535,20 +4535,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] ; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 @@ -4569,8 +4569,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_add_u32 s0, s4, 0 @@ -4583,13 +4583,13 @@ ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_addc_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: s_addc_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4603,8 +4603,8 @@ ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s3, s4, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 @@ -4622,8 +4622,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s9, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_add_u32 s0, s4, 0 @@ -4636,13 +4636,13 @@ ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_addc_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: s_addc_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4656,8 +4656,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s3, s4, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 @@ -4675,8 +4675,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_add_u32 s0, s4, 0 @@ -4689,13 +4689,13 @@ ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_addc_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_addc_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4709,8 +4709,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s3, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 @@ -4784,8 +4784,8 @@ ; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -4816,13 +4816,13 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, s5 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v4, s8 ; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4849,8 +4849,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -4886,13 +4886,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4919,8 +4919,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -4956,13 +4956,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NEXT: v_mov_b32_e32 v5, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5286,18 +5286,18 @@ ; GFX10-LABEL: saddsat_i128_vs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo @@ -5516,19 +5516,19 @@ ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, v12 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[10:11], v[4:5] +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v17 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[12:13], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[6:7] +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 @@ -5573,8 +5573,8 @@ ; GFX6-NEXT: s_and_b32 s17, s17, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -5614,24 +5614,24 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -5658,18 +5658,18 @@ ; GFX6-NEXT: s_and_b32 s8, s8, 1 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: v_mov_b32_e32 v8, s1 ; GFX6-NEXT: s_addc_u32 s7, s7, s10 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v8, s1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v8, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v8, s2 +; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 @@ -5699,8 +5699,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -5746,24 +5746,24 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -5795,18 +5795,18 @@ ; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: s_addc_u32 s7, s7, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v5 @@ -5836,8 +5836,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5883,24 +5883,24 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -5932,18 +5932,18 @@ ; GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: s_addc_u32 s7, s7, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -27,29 +27,29 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v6, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_xor_b32_e32 v5, v5, v7 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v10, v9, v3 ; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v13, v8, v3 ; CHECK-NEXT: v_mul_lo_u32 v12, v8, v3 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12 ; CHECK-NEXT: v_mul_lo_u32 v13, v3, v10 ; CHECK-NEXT: v_mul_hi_u32 v14, v3, v12 ; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12 +; CHECK-NEXT: v_xor_b32_e32 v5, v5, v7 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 @@ -57,12 +57,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_mul_hi_u32 v13, v3, v10 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -75,10 +75,10 @@ ; CHECK-NEXT: v_mul_hi_u32 v8, v8, v3 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_mul_hi_u32 v10, v3, v13 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 ; CHECK-NEXT: v_mul_lo_u32 v12, v3, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v13 ; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -87,12 +87,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CHECK-NEXT: v_mul_hi_u32 v12, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 @@ -111,12 +111,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_mul_hi_u32 v9, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 @@ -235,12 +235,12 @@ ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mov_b32_e32 v6, s11 -; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s3, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s3, v0 ; CHECK-NEXT: v_mul_lo_u32 v4, s3, v0 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -256,12 +256,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_mul_hi_u32 v5, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -286,12 +286,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v5, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v8, v2 ; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v7, s[0:1], v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v7, v5 @@ -311,12 +311,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v5, s13, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, s12, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, s13, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v1, s13, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -334,9 +334,9 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 ; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s10, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v3 +; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, s10, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[0:1] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 @@ -350,8 +350,8 @@ ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; CHECK-NEXT: s_mov_b32 s1, 0 @@ -412,29 +412,29 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc ; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v11 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x5f7ffffc, v9 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v9 ; GISEL-NEXT: v_trunc_f32_e32 v10, v10 ; GISEL-NEXT: v_mac_f32_e32 v9, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, 0, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v14, v13, v9 ; GISEL-NEXT: v_mul_lo_u32 v15, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v17, v12, v9 ; GISEL-NEXT: v_mul_lo_u32 v16, v12, v9 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v11 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; GISEL-NEXT: v_mul_lo_u32 v15, v10, v16 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v14 ; GISEL-NEXT: v_mul_hi_u32 v18, v9, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v11 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 @@ -442,12 +442,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_mul_hi_u32 v17, v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 @@ -460,10 +460,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v12, v12, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v17 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; GISEL-NEXT: v_mul_lo_u32 v13, v15, v17 ; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v17 ; GISEL-NEXT: v_mul_hi_u32 v17, v15, v17 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -472,12 +472,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v14, v15, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 ; GISEL-NEXT: v_mul_hi_u32 v16, v9, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v12 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 @@ -496,12 +496,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v14, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v10, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -539,42 +539,40 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v1, v14, v4, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v5 ; GISEL-NEXT: v_xor_b32_e32 v4, v11, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v6 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v7, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v14, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -582,12 +580,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 @@ -600,11 +598,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v8 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 @@ -612,12 +611,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; GISEL-NEXT: v_mul_hi_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 @@ -625,25 +624,26 @@ ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v2, v9 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_mul_hi_u32 v4, v2, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 ; GISEL-NEXT: v_mul_hi_u32 v12, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 @@ -716,26 +716,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v5 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v11, v11 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v11 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v5 ; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v15, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v17, v12, v3 ; CGP-NEXT: v_mul_lo_u32 v16, v12, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v5 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_mul_lo_u32 v15, v11, v16 ; CGP-NEXT: v_mul_lo_u32 v17, v3, v14 ; CGP-NEXT: v_mul_hi_u32 v18, v3, v16 ; CGP-NEXT: v_mul_hi_u32 v16, v11, v16 +; CGP-NEXT: v_xor_b32_e32 v10, v10, v5 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 @@ -743,12 +743,12 @@ ; CGP-NEXT: v_mul_lo_u32 v18, v11, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_mul_hi_u32 v17, v3, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 @@ -761,10 +761,10 @@ ; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 ; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -773,12 +773,12 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 ; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 @@ -797,12 +797,12 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v10, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v10, v11 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v11, v10, v11 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -895,26 +895,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v7 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v9, v9 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 ; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v6, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v12, v11, v5 ; CGP-NEXT: v_mul_lo_u32 v13, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v15, v10, v5 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v7 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v7 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v14 ; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 ; CGP-NEXT: v_mul_hi_u32 v16, v5, v14 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v7 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 @@ -922,12 +922,12 @@ ; CGP-NEXT: v_mul_lo_u32 v16, v9, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 @@ -940,10 +940,10 @@ ; CGP-NEXT: v_mul_hi_u32 v10, v10, v5 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v15 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 ; CGP-NEXT: v_mul_lo_u32 v14, v5, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -952,12 +952,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; CGP-NEXT: v_mul_hi_u32 v14, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 @@ -976,12 +976,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v6, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -1071,19 +1071,19 @@ ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1097,12 +1097,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1128,12 +1128,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 @@ -1152,12 +1152,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 @@ -1176,13 +1176,13 @@ ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_mov_b32_e32 v7, s7 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v8, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1212,8 +1212,8 @@ ; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1234,19 +1234,18 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 ; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1254,12 +1253,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1277,6 +1276,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 @@ -1284,12 +1284,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -1309,12 +1309,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1343,20 +1343,20 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GISEL-NEXT: s_and_b32 s5, s5, 1 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 -; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 @@ -1399,12 +1399,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1430,12 +1430,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -1455,12 +1455,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1517,27 +1517,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1545,12 +1544,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -1568,7 +1567,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -1576,12 +1575,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 @@ -1593,7 +1592,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -1601,12 +1600,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1615,7 +1614,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -1626,13 +1625,13 @@ ; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_mov_b32_e32 v11, s8 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v7 ; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v12, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1650,11 +1649,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1670,12 +1670,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -1701,40 +1701,40 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 @@ -1754,13 +1754,13 @@ ; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_mov_b32_e32 v9, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v10, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1794,19 +1794,19 @@ ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1820,12 +1820,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1851,12 +1851,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 @@ -1875,12 +1875,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 @@ -1899,13 +1899,13 @@ ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_mov_b32_e32 v7, s7 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_mov_b32_e32 v8, s4 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -1935,8 +1935,8 @@ ; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1957,19 +1957,18 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 ; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1977,12 +1976,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2000,6 +1999,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 @@ -2007,12 +2007,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -2032,12 +2032,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -2066,20 +2066,20 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GISEL-NEXT: s_add_u32 s4, s10, 0 -; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GISEL-NEXT: s_and_b32 s5, s5, 1 +; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 -; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 @@ -2122,12 +2122,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2153,12 +2153,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -2178,12 +2178,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -2240,27 +2240,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2268,12 +2267,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -2291,7 +2290,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -2299,12 +2298,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 @@ -2316,7 +2315,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2324,12 +2323,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2338,7 +2337,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s7, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s7, v7 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 @@ -2349,13 +2348,13 @@ ; CGP-NEXT: v_subrev_i32_e32 v0, vcc, s7, v0 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_mov_b32_e32 v11, s8 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v7 ; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v9, s[4:5] ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v12, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 @@ -2373,11 +2372,12 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -2393,12 +2393,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -2424,40 +2424,40 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 @@ -2477,13 +2477,13 @@ ; CGP-NEXT: v_subrev_i32_e32 v2, vcc, s7, v2 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_mov_b32_e32 v9, s6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] ; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v10, s4 ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -2530,29 +2530,29 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc ; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v7 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v6, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc ; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5 ; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5 ; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v7 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12 ; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10 ; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12 ; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14 @@ -2560,12 +2560,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -2578,10 +2578,10 @@ ; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 ; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 ; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -2590,12 +2590,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 @@ -2614,12 +2614,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v10, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 @@ -2714,22 +2714,22 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GISEL-NEXT: v_xor_b32_e32 v9, v0, v10 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v17, v1, v10 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v12, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v0 +; GISEL-NEXT: v_xor_b32_e32 v17, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v15 @@ -2743,12 +2743,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v16, v1 ; GISEL-NEXT: v_mul_hi_u32 v16, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 @@ -2761,10 +2761,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v0 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_lo_u32 v12, v1, v15 ; GISEL-NEXT: v_mul_lo_u32 v14, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v1, v15 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -2773,12 +2773,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v1, v11 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v1, v1, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v13 @@ -2798,12 +2798,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 @@ -2857,8 +2857,8 @@ ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v10 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v1, v10 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_xor_b32_e32 v3, v1, v10 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 @@ -2884,12 +2884,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, v1, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_hi_u32 v16, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 @@ -2902,10 +2902,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v0 ; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v13 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v16 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 ; GISEL-NEXT: v_mul_lo_u32 v15, v0, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -2914,12 +2914,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; GISEL-NEXT: v_mul_hi_u32 v15, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 @@ -3002,8 +3002,8 @@ ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -3021,26 +3021,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v6 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v9, v9 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v9 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 ; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v15, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v17, v12, v3 ; CGP-NEXT: v_mul_lo_u32 v16, v12, v3 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v6 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_mul_lo_u32 v15, v9, v16 ; CGP-NEXT: v_mul_lo_u32 v17, v3, v14 ; CGP-NEXT: v_mul_hi_u32 v18, v3, v16 ; CGP-NEXT: v_mul_hi_u32 v16, v9, v16 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v6 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 @@ -3048,12 +3048,12 @@ ; CGP-NEXT: v_mul_lo_u32 v18, v9, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_mul_hi_u32 v17, v3, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 @@ -3066,10 +3066,10 @@ ; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 ; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -3078,12 +3078,12 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 ; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 @@ -3102,12 +3102,12 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v8, v9 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -3197,29 +3197,29 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v9 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v9 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc ; CGP-NEXT: v_mul_lo_u32 v12, v11, v6 ; CGP-NEXT: v_mul_lo_u32 v13, v10, v8 ; CGP-NEXT: v_mul_hi_u32 v15, v10, v6 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v6 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v9 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_mul_lo_u32 v13, v8, v14 ; CGP-NEXT: v_mul_lo_u32 v15, v6, v12 ; CGP-NEXT: v_mul_hi_u32 v16, v6, v14 ; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v9 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 @@ -3227,12 +3227,12 @@ ; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_mul_hi_u32 v15, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 @@ -3245,10 +3245,10 @@ ; CGP-NEXT: v_mul_hi_u32 v10, v10, v6 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v12, v6, v15 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 ; CGP-NEXT: v_mul_lo_u32 v14, v6, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v6, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -3257,12 +3257,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; CGP-NEXT: v_mul_hi_u32 v14, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 @@ -3281,12 +3281,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -3385,8 +3385,8 @@ ; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 @@ -3431,9 +3431,9 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 ; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -3461,12 +3461,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -3479,10 +3479,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3491,12 +3491,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -3515,12 +3515,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -3592,12 +3592,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -3610,10 +3610,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v5, v5, v3 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3622,12 +3622,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v5 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -3648,12 +3648,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -85,8 +85,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] @@ -117,8 +117,8 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -147,8 +147,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s12, s11, 31 ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 +; GFX8-NEXT: s_ashr_i32 s12, s11, 31 ; GFX8-NEXT: s_add_u32 s0, s8, s2 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 @@ -176,12 +176,12 @@ ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_subb_u32 s15, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v3, s15, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s15, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s14, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s14, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, s9 @@ -198,12 +198,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -228,12 +228,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 @@ -253,12 +253,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s10, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -284,25 +284,25 @@ ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 @@ -327,8 +327,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 ; GFX9-NEXT: s_ashr_i32 s2, s9, 31 +; GFX9-NEXT: s_ashr_i32 s12, s11, 31 ; GFX9-NEXT: s_add_u32 s0, s8, s2 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 @@ -356,12 +356,12 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_subb_u32 s15, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v3, s15, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s15, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s14, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, s11 @@ -383,10 +383,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s14, v3 @@ -459,26 +459,26 @@ ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -518,11 +518,11 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX10-NEXT: s_sub_u32 s1, 0, s8 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_subb_u32 s14, 0, s9 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -614,8 +614,8 @@ ; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v0, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s10, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s11, v2 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s10, v5 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v2, s0, s11, v2, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v5 @@ -637,13 +637,13 @@ ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0 ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8 -; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo +; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v7, s0 @@ -784,8 +784,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 @@ -810,12 +810,12 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: s_xor_b32 s4, s11, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -866,22 +866,21 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3 ; GFX10-NEXT: s_xor_b32 s1, s10, s2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s8, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo @@ -890,6 +889,7 @@ ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 @@ -960,9 +960,9 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: s_add_i32 s0, s2, s5 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s14, v2 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s16 +; GFX8-NEXT: s_add_i32 s0, s2, s5 ; GFX8-NEXT: s_xor_b32 s2, s0, s5 ; GFX8-NEXT: s_ashr_i32 s12, s6, 31 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v5 @@ -972,29 +972,29 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s16, v2 ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s16, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: s_add_i32 s1, s6, s12 ; GFX8-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX8-NEXT: s_add_i32 s1, s6, s12 ; GFX8-NEXT: s_xor_b32 s1, s1, s12 ; GFX8-NEXT: s_xor_b32 s0, s4, s15 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, s1, v5 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v2 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 -; GFX8-NEXT: s_add_i32 s0, s3, s4 ; GFX8-NEXT: v_mul_lo_u32 v7, v6, s2 +; GFX8-NEXT: s_add_i32 s0, s3, s4 ; GFX8-NEXT: s_xor_b32 s3, s0, s4 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v6 @@ -1003,10 +1003,9 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v6 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 @@ -1015,16 +1014,17 @@ ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v3 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_add_i32 s1, s7, s2 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_xor_b32 s0, s12, s5 ; GFX8-NEXT: v_xor_b32_e32 v6, s0, v6 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v6 ; GFX8-NEXT: v_xor_b32_e32 v6, s12, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 ; GFX8-NEXT: v_mul_lo_u32 v7, v3, s3 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s1, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 @@ -1097,8 +1097,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: s_xor_b32 s7, s9, s13 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 @@ -1136,15 +1136,14 @@ ; GFX9-NEXT: s_add_i32 s8, s10, s5 ; GFX9-NEXT: s_xor_b32 s8, s8, s5 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GFX9-NEXT: v_mul_hi_u32 v6, s8, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s13, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-NEXT: v_mul_lo_u32 v7, v6, s7 +; GFX9-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v3 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 ; GFX9-NEXT: s_sub_i32 s8, 0, s9 ; GFX9-NEXT: v_mul_lo_u32 v8, s8, v2 @@ -1164,12 +1163,12 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 ; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: s_xor_b32 s6, s5, s6 ; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v6 ; GFX9-NEXT: v_mul_lo_u32 v7, v8, s9 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v3 ; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v8 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 @@ -1182,8 +1181,9 @@ ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1294,23 +1294,23 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s10, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v5 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s8, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s11, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s8, v6 ; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s9, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 +; GFX10-NEXT: s_xor_b32 s0, s19, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s19, s15 +; GFX10-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s13, v1 ; GFX10-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 @@ -1373,12 +1373,12 @@ ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_subb_u32 s17, 0, s15 -; GFX8-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s16, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s16, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s16, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, s15 @@ -1395,12 +1395,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -1425,12 +1425,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 @@ -1450,12 +1450,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -1481,26 +1481,26 @@ ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s14, v7 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s14, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: s_ashr_i32 s8, s11, 31 @@ -1535,13 +1535,13 @@ ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: v_trunc_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 ; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] +; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX8-NEXT: s_sub_u32 s10, 0, s2 -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_subb_u32 s11, 0, s3 @@ -1563,12 +1563,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v11, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 ; GFX8-NEXT: v_mul_hi_u32 v9, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 +; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 @@ -1593,12 +1593,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v9, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v12, v6 ; GFX8-NEXT: v_mul_hi_u32 v12, v2, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v11, s[0:1], v11, v12 +; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v11, v9 @@ -1618,12 +1618,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v9, s7, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_mul_hi_u32 v7, s6, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 @@ -1661,14 +1661,14 @@ ; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 ; GFX8-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2 @@ -1725,12 +1725,12 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_subb_u32 s17, 0, s15 -; GFX9-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s16, v0 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 @@ -1751,10 +1751,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s16, v3 @@ -1827,28 +1827,28 @@ ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s14, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s14, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] -; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 ; GFX9-NEXT: s_ashr_i32 s12, s3, 31 ; GFX9-NEXT: s_add_u32 s10, s10, s8 ; GFX9-NEXT: s_cselect_b32 s7, 1, 0 @@ -1877,9 +1877,9 @@ ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6 ; GFX9-NEXT: v_add_f32_e32 v4, v7, v4 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: s_subb_u32 s14, 0, s3 @@ -1909,10 +1909,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v9, v10, v9 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v5, v9, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 ; GFX9-NEXT: v_mul_lo_u32 v9, s7, v7 @@ -1945,17 +1945,17 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 ; GFX9-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v2, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v2, s10, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, s11, v7 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 ; GFX9-NEXT: v_mul_hi_u32 v8, s10, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, s11, v7 @@ -2003,12 +2003,12 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 ; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 @@ -2017,8 +2017,8 @@ ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v7, s8, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s8, v6 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_xor_b32_e32 v8, s8, v6 ; GFX9-NEXT: v_mov_b32_e32 v9, s8 ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s8, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc @@ -2058,32 +2058,32 @@ ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_subb_u32 s23, 0, s9 ; GFX10-NEXT: s_ashr_i32 s16, s11, 31 -; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_ashr_i32 s18, s3, 31 ; GFX10-NEXT: s_xor_b64 s[20:21], s[12:13], s[6:7] +; GFX10-NEXT: s_ashr_i32 s18, s3, 31 +; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX10-NEXT: s_add_u32 s0, s10, s16 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 ; GFX10-NEXT: s_mov_b32 s19, s18 +; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_mov_b32 s17, s16 ; GFX10-NEXT: s_addc_u32 s1, s11, s16 ; GFX10-NEXT: s_add_u32 s2, s2, s18 ; GFX10-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10-NEXT: s_and_b32 s6, s6, 1 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: s_addc_u32 s3, s3, s18 ; GFX10-NEXT: s_xor_b64 s[10:11], s[0:1], s[16:17] ; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[18:19] -; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX10-NEXT: s_sub_u32 s6, 0, s2 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: s_and_b32 s0, s0, 1 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 @@ -2112,19 +2112,19 @@ ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_add_f32_e32 v1, v9, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v8 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v10, v6 -; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 +; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 ; GFX10-NEXT: v_mul_lo_u32 v12, s7, v1 ; GFX10-NEXT: v_mul_hi_u32 v13, s6, v1 -; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 -; GFX10-NEXT: v_mul_lo_u32 v11, s6, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v11, s6, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, v6, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 @@ -2136,95 +2136,95 @@ ; GFX10-NEXT: v_mul_lo_u32 v6, v1, v8 ; GFX10-NEXT: v_mul_lo_u32 v7, v4, v8 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 -; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo -; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8 ; GFX10-NEXT: v_mul_lo_u32 v14, s23, v0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v9, v6 -; GFX10-NEXT: v_mul_hi_u32 v15, s22, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v11 +; GFX10-NEXT: v_mul_hi_u32 v15, s22, v0 ; GFX10-NEXT: v_mul_lo_u32 v16, s22, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10 -; GFX10-NEXT: v_mul_lo_u32 v13, s22, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX10-NEXT: v_mul_lo_u32 v13, s22, v0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 -; GFX10-NEXT: v_add3_u32 v14, v14, v16, v15 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6 +; GFX10-NEXT: v_add3_u32 v14, v14, v16, v15 +; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v10, v12, v13 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 -; GFX10-NEXT: v_mul_lo_u32 v11, v0, v14 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v6 +; GFX10-NEXT: v_mul_lo_u32 v11, v0, v14 ; GFX10-NEXT: v_mul_hi_u32 v9, v0, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_mul_hi_u32 v13, v12, v13 ; GFX10-NEXT: v_mul_lo_u32 v15, v12, v14 ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5 -; GFX10-NEXT: v_mul_hi_u32 v16, v0, v14 ; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8 ; GFX10-NEXT: v_add_co_u32 v5, s1, v10, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: v_mul_hi_u32 v16, v0, v14 ; GFX10-NEXT: v_add_co_u32 v8, s1, v15, v13 -; GFX10-NEXT: v_mul_lo_u32 v13, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s1, v4, v6, s0 ; GFX10-NEXT: v_add_co_u32 v5, s1, v5, v9 -; GFX10-NEXT: v_mul_hi_u32 v15, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v16 -; GFX10-NEXT: v_mul_lo_u32 v9, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX10-NEXT: v_mul_lo_u32 v13, s7, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5 +; GFX10-NEXT: v_mul_hi_u32 v15, s6, v1 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v11 ; GFX10-NEXT: v_mul_hi_u32 v7, v12, v14 -; GFX10-NEXT: v_mul_lo_u32 v12, v11, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v16 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_add_co_u32 v5, s1, v8, v5 -; GFX10-NEXT: v_add3_u32 v9, v13, v9, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v6 +; GFX10-NEXT: v_add3_u32 v9, v13, v9, v15 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 +; GFX10-NEXT: v_mul_lo_u32 v14, v1, v9 +; GFX10-NEXT: v_mul_lo_u32 v12, v11, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v13, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v3, v11, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, v1, v9 -; GFX10-NEXT: v_add3_u32 v7, v10, v8, v7 ; GFX10-NEXT: v_mul_lo_u32 v8, v11, v9 -; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9 -; GFX10-NEXT: v_mul_hi_u32 v9, v11, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9 ; GFX10-NEXT: v_add_co_u32 v7, s1, v12, v14 +; GFX10-NEXT: v_mul_hi_u32 v9, v11, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v3, s1, v8, v3 ; GFX10-NEXT: v_mul_lo_u32 v8, s15, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v13 ; GFX10-NEXT: v_mul_lo_u32 v14, s14, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 ; GFX10-NEXT: v_mul_hi_u32 v12, s14, v0 ; GFX10-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v13 ; GFX10-NEXT: v_mul_lo_u32 v13, s15, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v10 -; GFX10-NEXT: v_mul_hi_u32 v15, s14, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v14 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v0, s1, v13, v0 -; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX10-NEXT: v_mul_hi_u32 v15, s14, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v12 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 +; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10 ; GFX10-NEXT: v_add_co_u32 v0, s1, v0, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v14, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v0, s1, v0, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v13, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 @@ -2242,21 +2242,21 @@ ; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1 ; GFX10-NEXT: v_mul_hi_u32 v7, s11, v1 ; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s14, v5 -; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, s15, v4 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v15, s11, v3 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5 -; GFX10-NEXT: v_mul_hi_u32 v1, s10, v1 +; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4 -; GFX10-NEXT: v_mul_hi_u32 v17, s10, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX10-NEXT: v_mul_lo_u32 v15, s11, v3 +; GFX10-NEXT: v_mul_hi_u32 v1, s10, v1 +; GFX10-NEXT: v_mul_hi_u32 v17, s10, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v5, s8 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v4 +; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v10, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v13 @@ -2268,13 +2268,13 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v15, v7 ; GFX10-NEXT: v_add_co_u32 v1, s1, v6, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v17 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v14, v1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v14, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v13 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v16, s0 @@ -2295,18 +2295,18 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo ; GFX10-NEXT: v_add3_u32 v6, v10, v11, v15 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v8, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v8, s0 ; GFX10-NEXT: v_sub_co_u32 v8, s0, s10, v16 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v4, s11, v6 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v10 -; GFX10-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 ; GFX10-NEXT: v_xor_b32_e32 v5, s12, v5 +; GFX10-NEXT: v_xor_b32_e32 v2, s12, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 @@ -2314,9 +2314,9 @@ ; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s20 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 @@ -2331,8 +2331,8 @@ ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 ; GFX10-NEXT: v_sub_co_u32 v11, s0, v12, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v12, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc_lo @@ -2340,8 +2340,8 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v4, s0 -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v5, s12 ; GFX10-NEXT: s_xor_b64 s[0:1], s[16:17], s[18:19] +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v5, s12 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -2448,8 +2448,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] @@ -2482,8 +2482,8 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -2523,10 +2523,10 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_add_i32 s1, s1, s10 ; GFX8-NEXT: s_xor_b32 s11, s1, s10 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: s_sext_i32_i8 s0, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_sext_i32_i8 s0, s2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_ashr_i32 s9, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s9 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -2563,8 +2563,8 @@ ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 @@ -2586,8 +2586,8 @@ ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 @@ -2667,12 +2667,12 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] @@ -2730,8 +2730,8 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v4, s0 @@ -2859,8 +2859,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v2, v0, s[0:1] @@ -2893,8 +2893,8 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: s_xor_b32 s4, s8, s6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -2940,8 +2940,8 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_add_i32 s0, s0, s11 -; GFX8-NEXT: s_xor_b32 s12, s0, s11 ; GFX8-NEXT: s_xor_b32 s1, s1, s10 +; GFX8-NEXT: s_xor_b32 s12, s0, s11 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s12 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 @@ -2952,13 +2952,13 @@ ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 @@ -2976,8 +2976,8 @@ ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s12 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 @@ -2995,8 +2995,8 @@ ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -3026,10 +3026,10 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s10, s6, 31 ; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s11, 0, s8 +; GFX9-NEXT: s_xor_b32 s6, s6, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s5, s9 @@ -3085,8 +3085,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s11, v2 ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -3142,22 +3142,22 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: s_xor_b32 s2, s1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo @@ -3228,8 +3228,8 @@ ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -3275,8 +3275,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 7, v1 @@ -3307,8 +3307,8 @@ ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -3427,8 +3427,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 @@ -3459,8 +3459,8 @@ ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -313,8 +313,8 @@ ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dword v4, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -404,8 +404,8 @@ ; GFX7-NEXT: s_brev_b32 s4, -4 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -416,8 +416,8 @@ ; GFX8-NEXT: s_brev_b32 s4, -4 ; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -428,8 +428,8 @@ ; GFX9-NEXT: s_brev_b32 s4, -4 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -487,8 +487,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_brev_b32 s4, -8 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 @@ -499,8 +499,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -8 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -511,8 +511,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_brev_b32 s4, -8 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -524,12 +524,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_brev_b32 s4, -8 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> @@ -622,8 +622,8 @@ ; GFX8-LABEL: s_shl_v2i32_zext_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_movk_i32 s2, 0x3fff -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -812,8 +812,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 @@ -939,9 +939,9 @@ ; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -991,9 +991,9 @@ ; GFX6-NEXT: s_and_b32 s4, s5, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s6, s8 -; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -1008,15 +1008,15 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s2, s4, s7 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_lshr_b32 s8, s3, 16 ; GFX8-NEXT: s_and_b32 s3, s3, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s2, s4, s7 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 @@ -1096,12 +1096,12 @@ ; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v8, v3 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, v8, v6 @@ -1109,13 +1109,13 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 ; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, v8, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -1175,9 +1175,9 @@ ; GFX6-NEXT: s_and_b32 s8, s11, s16 ; GFX6-NEXT: s_lshl_b32 s3, s3, s8 ; GFX6-NEXT: s_and_b32 s8, s12, s16 -; GFX6-NEXT: s_and_b32 s1, s1, s16 ; GFX6-NEXT: s_lshl_b32 s4, s4, s8 ; GFX6-NEXT: s_and_b32 s8, s13, s16 +; GFX6-NEXT: s_and_b32 s1, s1, s16 ; GFX6-NEXT: s_lshl_b32 s5, s5, s8 ; GFX6-NEXT: s_and_b32 s8, s14, s16 ; GFX6-NEXT: s_and_b32 s0, s0, s16 @@ -1187,13 +1187,13 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s16 ; GFX6-NEXT: s_and_b32 s2, s3, s16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_lshl_b32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s4, s7, s16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, s16 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 @@ -1204,35 +1204,35 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s13, s4, 16 ; GFX8-NEXT: s_and_b32 s4, s4, s12 -; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s4, s8, s13 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_lshr_b32 s14, s5, 16 ; GFX8-NEXT: s_and_b32 s5, s5, s12 -; GFX8-NEXT: s_lshl_b32 s1, s1, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s4, s8, s13 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_lshr_b32 s15, s6, 16 ; GFX8-NEXT: s_and_b32 s6, s6, s12 +; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshl_b32 s5, s9, s14 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshl_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 -; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_lshr_b32 s16, s7, 16 ; GFX8-NEXT: s_and_b32 s7, s7, s12 +; GFX8-NEXT: s_lshl_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s6, s10, s15 +; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 ; GFX8-NEXT: s_and_b32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 -; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s7, s11, s16 +; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 ; GFX8-NEXT: s_and_b32 s2, s2, s12 ; GFX8-NEXT: s_or_b32 s2, s4, s2 @@ -1255,8 +1255,8 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s5, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -30,26 +30,26 @@ ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v6 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v5, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 ; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 ; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -57,12 +57,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -75,10 +75,10 @@ ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 ; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -87,12 +87,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -111,12 +111,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -203,8 +203,8 @@ ; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 ; CHECK-NEXT: s_cbranch_vccz BB1_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: s_ashr_i32 s0, s5, 31 ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 +; CHECK-NEXT: s_ashr_i32 s0, s5, 31 ; CHECK-NEXT: s_add_u32 s8, s2, s6 ; CHECK-NEXT: s_cselect_b32 s7, 1, 0 ; CHECK-NEXT: s_and_b32 s7, s7, 1 @@ -231,12 +231,12 @@ ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v1, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CHECK-NEXT: s_subb_u32 s5, 0, s11 ; CHECK-NEXT: v_mov_b32_e32 v6, s11 -; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s3, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, s5, v0 ; CHECK-NEXT: v_mul_hi_u32 v5, s3, v0 ; CHECK-NEXT: v_mul_lo_u32 v4, s3, v0 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 @@ -252,12 +252,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_mul_hi_u32 v5, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -282,12 +282,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v5, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v8, v2 ; CHECK-NEXT: v_mul_hi_u32 v8, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v5, s[0:1], v5, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v7, s[0:1], v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 ; CHECK-NEXT: v_add_i32_e64 v2, s[0:1], v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; CHECK-NEXT: v_add_i32_e64 v4, s[0:1], v7, v5 @@ -307,12 +307,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v5, s9, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, s8, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, s9, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v1, s9, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -398,35 +398,35 @@ ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 ; GISEL-NEXT: v_xor_b32_e32 v4, v4, v8 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc ; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc ; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 ; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v14, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v16, v11, v8 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v15 ; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -434,12 +434,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_hi_u32 v16, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 @@ -452,10 +452,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v11, v8 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_mul_lo_u32 v12, v14, v16 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v14, v16 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -464,12 +464,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; GISEL-NEXT: v_mul_hi_u32 v15, v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v11 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 @@ -488,12 +488,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -520,9 +520,9 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v5 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 @@ -540,31 +540,29 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 ; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v6 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v9, v6 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v6 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v10 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v14 ; GISEL-NEXT: v_mul_lo_u32 v15, v6, v12 ; GISEL-NEXT: v_mul_hi_u32 v16, v6, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 @@ -572,12 +570,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v16, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mul_hi_u32 v15, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 @@ -590,11 +588,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v9, v9, v6 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v15 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; GISEL-NEXT: v_mul_lo_u32 v11, v13, v15 ; GISEL-NEXT: v_mul_lo_u32 v14, v6, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v6, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v13, v15 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 @@ -602,12 +601,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v13, v9 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 @@ -615,25 +614,26 @@ ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v10 ; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6 ; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc ; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -704,26 +704,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v10, v10 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 ; CGP-NEXT: v_mul_hi_u32 v16, v11, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v11, v2 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 ; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 ; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -731,12 +731,12 @@ ; CGP-NEXT: v_mul_lo_u32 v17, v10, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_mul_hi_u32 v16, v2, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 @@ -749,10 +749,10 @@ ; CGP-NEXT: v_mul_hi_u32 v11, v11, v2 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 ; CGP-NEXT: v_mul_lo_u32 v15, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 ; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -761,12 +761,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; CGP-NEXT: v_mul_hi_u32 v15, v2, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 @@ -785,12 +785,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CGP-NEXT: v_mul_hi_u32 v12, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -879,26 +879,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v6 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v6 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v6 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 @@ -906,12 +906,12 @@ ; CGP-NEXT: v_mul_lo_u32 v15, v8, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 @@ -924,10 +924,10 @@ ; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -936,12 +936,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 @@ -960,12 +960,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1051,19 +1051,19 @@ ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1077,12 +1077,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1108,12 +1108,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 @@ -1132,12 +1132,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 @@ -1147,13 +1147,13 @@ ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] @@ -1190,8 +1190,8 @@ ; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1212,19 +1212,18 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 ; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1232,12 +1231,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1255,6 +1254,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 @@ -1262,12 +1262,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -1287,12 +1287,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1326,14 +1326,14 @@ ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 -; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 @@ -1376,12 +1376,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1407,12 +1407,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -1432,12 +1432,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1461,12 +1461,12 @@ ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v7 -; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v8 +; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 @@ -1493,27 +1493,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -1521,12 +1520,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -1544,7 +1543,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -1552,12 +1551,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 @@ -1569,7 +1568,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -1577,12 +1576,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1591,15 +1590,15 @@ ; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 ; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 ; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mov_b32_e32 v9, s8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v9, s8 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] @@ -1627,6 +1626,7 @@ ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc @@ -1644,12 +1644,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -1675,40 +1675,40 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 @@ -1719,13 +1719,13 @@ ; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mov_b32_e32 v7, s6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v7, s6 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] @@ -1766,19 +1766,19 @@ ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, s6, v2 ; CHECK-NEXT: v_mul_lo_u32 v7, s6, v2 +; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1792,12 +1792,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v8, v2, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1823,12 +1823,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v8, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CHECK-NEXT: v_mul_hi_u32 v10, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v7 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v9, v8 @@ -1847,12 +1847,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 @@ -1862,13 +1862,13 @@ ; CHECK-NEXT: v_mul_lo_u32 v6, s6, v2 ; CHECK-NEXT: v_mul_hi_u32 v2, s6, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v2, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 ; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v5, s7 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] @@ -1905,8 +1905,8 @@ ; GISEL-NEXT: s_add_u32 s4, s10, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s6, 0 +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1927,19 +1927,18 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v7, s12, v4 ; GISEL-NEXT: v_mul_lo_u32 v8, s11, v5 ; GISEL-NEXT: v_mul_hi_u32 v10, s11, v4 ; GISEL-NEXT: v_mul_lo_u32 v9, s11, v4 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v8, v5, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1947,12 +1946,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1970,6 +1969,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9 ; GISEL-NEXT: v_mul_hi_u32 v7, v4, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 @@ -1977,12 +1977,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -2002,12 +2002,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -2041,14 +2041,14 @@ ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: s_and_b32 s5, s5, 1 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: s_cmp_lg_u32 s5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 -; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s7 @@ -2091,12 +2091,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GISEL-NEXT: v_mul_hi_u32 v10, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2122,12 +2122,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v8, v8, v9 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v10 @@ -2147,12 +2147,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -2176,12 +2176,12 @@ ; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s6, v2 ; GISEL-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v3, vcc ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v8 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v7 -; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v8 +; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 @@ -2208,27 +2208,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v5 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_mul_lo_u32 v9, -1, v7 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 ; CGP-NEXT: v_mul_hi_u32 v12, s6, v7 ; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_mul_lo_u32 v10, v8, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v9 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2236,12 +2235,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_mul_hi_u32 v12, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -2259,7 +2258,7 @@ ; CGP-NEXT: v_mul_lo_u32 v14, v7, v11 ; CGP-NEXT: v_mul_hi_u32 v9, v7, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 @@ -2267,12 +2266,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v14, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v7, v11 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v12 @@ -2284,7 +2283,7 @@ ; CGP-NEXT: v_mul_lo_u32 v10, v0, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v0, v7 ; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 @@ -2292,12 +2291,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v1, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v0, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2306,15 +2305,15 @@ ; CGP-NEXT: v_mul_lo_u32 v8, s7, v8 ; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 ; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mov_b32_e32 v9, s8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 ; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v9, s8 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] @@ -2342,6 +2341,7 @@ ; CGP-NEXT: v_mul_lo_u32 v8, -1, v4 ; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 ; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc @@ -2359,12 +2359,12 @@ ; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CGP-NEXT: v_mul_hi_u32 v11, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -2390,40 +2390,40 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v10 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v9, v2, v7 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; CGP-NEXT: v_mul_hi_u32 v5, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_mul_hi_u32 v9, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 @@ -2434,13 +2434,13 @@ ; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mov_b32_e32 v7, s6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc ; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 ; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CGP-NEXT: v_mov_b32_e32 v7, s6 ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] @@ -2494,29 +2494,29 @@ ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v0 ; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 -; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v5, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc ; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 ; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 ; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 ; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 @@ -2524,12 +2524,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -2542,10 +2542,10 @@ ; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 ; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -2554,12 +2554,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -2578,12 +2578,12 @@ ; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -2668,28 +2668,28 @@ ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v4, v7 +; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_xor_b32_e32 v8, v0, v9 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v7, v7 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 ; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v12, v11, v0 ; GISEL-NEXT: v_mul_lo_u32 v13, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, v10, v0 ; GISEL-NEXT: v_mul_lo_u32 v14, v10, v0 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v14 @@ -2703,12 +2703,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 ; GISEL-NEXT: v_mul_hi_u32 v15, v0, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 @@ -2721,10 +2721,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v0 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v14 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v14 ; GISEL-NEXT: v_mul_lo_u32 v13, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v1, v14 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -2733,12 +2733,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v1, v10 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v1, v10 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v1, v1, v10 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v12 @@ -2758,12 +2758,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v11, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 @@ -2800,8 +2800,8 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GISEL-NEXT: v_xor_b32_e32 v7, v0, v6 @@ -2815,8 +2815,8 @@ ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v8 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v1, v8 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GISEL-NEXT: v_xor_b32_e32 v3, v1, v8 ; GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v1, v1 ; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 @@ -2842,12 +2842,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v16, v1, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; GISEL-NEXT: v_mul_hi_u32 v15, v0, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v1, v12 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v1, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 @@ -2860,10 +2860,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v10, v10, v0 ; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v15 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v13, v15 ; GISEL-NEXT: v_mul_lo_u32 v14, v0, v10 +; GISEL-NEXT: v_mul_hi_u32 v12, v0, v15 ; GISEL-NEXT: v_mul_hi_u32 v15, v13, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -2872,12 +2872,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v12, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 ; GISEL-NEXT: v_mul_hi_u32 v14, v0, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_mul_hi_u32 v10, v13, v10 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 @@ -2958,8 +2958,8 @@ ; CGP-NEXT: v_mov_b32_e32 v8, v0 ; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -2977,26 +2977,26 @@ ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v14, v9, v8 ; CGP-NEXT: v_mul_hi_u32 v16, v9, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v9, v2 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_mul_lo_u32 v14, v8, v15 ; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 ; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 ; CGP-NEXT: v_mul_hi_u32 v15, v8, v15 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 @@ -3004,12 +3004,12 @@ ; CGP-NEXT: v_mul_lo_u32 v17, v8, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_mul_hi_u32 v16, v2, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 @@ -3022,10 +3022,10 @@ ; CGP-NEXT: v_mul_hi_u32 v9, v9, v2 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 ; CGP-NEXT: v_mul_lo_u32 v15, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 ; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -3034,12 +3034,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v14, v9 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 ; CGP-NEXT: v_mul_hi_u32 v15, v2, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 @@ -3058,12 +3058,12 @@ ; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; CGP-NEXT: v_mul_hi_u32 v12, v3, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 @@ -3149,29 +3149,29 @@ ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v8 ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc ; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v8 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 ; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 ; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 @@ -3179,12 +3179,12 @@ ; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 @@ -3197,10 +3197,10 @@ ; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 ; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -3209,12 +3209,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 @@ -3233,12 +3233,12 @@ ; CGP-NEXT: v_mul_lo_u32 v11, v6, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -3271,8 +3271,8 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc @@ -3379,9 +3379,9 @@ ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 ; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 ; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -3409,12 +3409,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v12, v4, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -3427,10 +3427,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v11, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v4, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3439,12 +3439,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -3463,12 +3463,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -3539,12 +3539,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_mul_hi_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -3557,10 +3557,10 @@ ; GISEL-NEXT: v_mul_hi_u32 v7, v7, v3 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; GISEL-NEXT: v_mul_lo_u32 v8, v10, v12 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -3569,12 +3569,12 @@ ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 ; GISEL-NEXT: v_mul_hi_u32 v11, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 ; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 @@ -3595,12 +3595,12 @@ ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -3627,9 +3627,9 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 @@ -3665,13 +3665,13 @@ ; CGP-NEXT: v_cvt_f32_i32_e32 v3, v2 ; CGP-NEXT: v_rcp_f32_e32 v5, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25 ; CGP-NEXT: v_mul_f32_e32 v1, v3, v5 ; CGP-NEXT: v_trunc_f32_e32 v1, v1 ; CGP-NEXT: v_mad_f32 v3, -v1, v4, v3 ; CGP-NEXT: v_cvt_i32_f32_e32 v1, v1 ; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v4| ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; CGP-NEXT: v_bfe_i32 v0, v0, 0, 25 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CGP-NEXT: v_mul_lo_u32 v3, v1, v6 ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -10,9 +10,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 @@ -25,9 +25,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 @@ -62,9 +62,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 25 ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 @@ -124,9 +124,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 @@ -139,9 +139,9 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 @@ -176,9 +176,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 @@ -242,8 +242,8 @@ ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 @@ -252,8 +252,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 @@ -277,21 +277,21 @@ ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 +; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 +; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -349,8 +349,8 @@ ; GFX6-NEXT: s_max_i32 s6, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_min_i32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_max_i32 s1, s6, s1 @@ -381,11 +381,11 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s7, s0 ; GFX8-NEXT: s_sext_i32_i16 s8, -1 -; GFX8-NEXT: s_max_i32 s9, s7, s8 ; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_sub_i32 s9, s9, s5 +; GFX8-NEXT: s_max_i32 s9, s7, s8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_sub_i32 s9, s9, s5 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: s_min_i32 s7, s7, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 @@ -427,8 +427,8 @@ ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_mov_b32 s2, 0x80008 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -451,8 +451,8 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 @@ -488,8 +488,8 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 @@ -508,8 +508,8 @@ ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 @@ -517,24 +517,24 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 @@ -555,35 +555,35 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 ; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 -; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 ; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 +; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff +; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9 -; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6 +; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 -; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9 +; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 @@ -607,20 +607,20 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_sub_i16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 @@ -691,8 +691,8 @@ ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_brev_b32 s9, 1 +; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_min_i32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_max_i32 s1, s10, s1 @@ -701,8 +701,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_min_i32 s10, s1, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s10, s1, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s9 ; GFX6-NEXT: s_max_i32 s2, s5, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s10 @@ -710,22 +710,22 @@ ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_min_i32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s6 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 ; GFX6-NEXT: s_and_b32 s1, s1, s4 @@ -734,8 +734,8 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s3, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 @@ -751,13 +751,13 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s11, s0 ; GFX8-NEXT: s_sext_i32_i16 s12, -1 -; GFX8-NEXT: s_max_i32 s13, s11, s12 ; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_sub_i32 s13, s13, s9 +; GFX8-NEXT: s_max_i32 s13, s11, s12 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_sub_i32 s13, s13, s9 ; GFX8-NEXT: s_movk_i32 s10, 0x8000 ; GFX8-NEXT: s_min_i32 s11, s11, s12 ; GFX8-NEXT: s_sext_i32_i16 s13, s13 @@ -810,9 +810,9 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 +; GFX8-NEXT: s_sub_i32 s3, s3, s4 ; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -823,8 +823,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 @@ -838,19 +838,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_mov_b32 s4, 0x80008 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 @@ -859,19 +859,19 @@ ; GFX9-NEXT: s_lshr_b32 s7, s6, 16 ; GFX9-NEXT: s_lshl_b32 s4, s6, s4 ; GFX9-NEXT: s_lshl_b32 s6, s7, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_sub_i16 v1, s3, v1 clamp ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_pk_sub_i16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 -; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -885,8 +885,8 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_mov_b32 s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 @@ -904,8 +904,8 @@ ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp @@ -935,9 +935,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 @@ -987,9 +987,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 @@ -1042,8 +1042,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 @@ -1054,8 +1054,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x80000000, v3 ; GFX8-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v3 @@ -1082,8 +1082,8 @@ ; GFX6-LABEL: s_ssubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 @@ -1093,8 +1093,8 @@ ; GFX8-LABEL: s_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s2, s0, -1 -; GFX8-NEXT: s_min_i32 s3, s0, -1 ; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s3, s0, -1 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_min_i32 s1, s1, s3 @@ -1121,8 +1121,8 @@ ; GFX6-LABEL: ssubsat_i32_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s1, s0, -1 -; GFX6-NEXT: s_min_i32 s2, s0, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_min_i32 s2, s0, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 @@ -1132,8 +1132,8 @@ ; GFX8-LABEL: ssubsat_i32_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s1, s0, -1 -; GFX8-NEXT: s_min_i32 s2, s0, -1 ; GFX8-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX8-NEXT: s_min_i32 s2, s0, -1 ; GFX8-NEXT: s_sub_i32 s2, s2, 0x80000000 ; GFX8-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 @@ -1197,11 +1197,11 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 +; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 @@ -1218,11 +1218,11 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: s_brev_b32 s5, 1 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s5, v5 +; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 @@ -1257,8 +1257,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_min_i32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_max_i32 s2, s6, s2 @@ -1277,8 +1277,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s4 ; GFX8-NEXT: s_brev_b32 s5, 1 +; GFX8-NEXT: s_sub_i32 s6, s6, s4 ; GFX8-NEXT: s_min_i32 s7, s0, -1 ; GFX8-NEXT: s_sub_i32 s7, s7, s5 ; GFX8-NEXT: s_max_i32 s2, s6, s2 @@ -1320,18 +1320,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s4, v6 ; GFX6-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, s5, v7 +; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 +; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 @@ -1348,18 +1348,18 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 ; GFX8-NEXT: s_brev_b32 s5, 1 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s4, v6 ; GFX8-NEXT: v_min_i32_e32 v7, -1, v0 -; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s5, v7 +; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 +; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 @@ -1396,18 +1396,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s6, -2 ; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s6 ; GFX6-NEXT: s_brev_b32 s7, 1 +; GFX6-NEXT: s_sub_i32 s8, s8, s6 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_max_i32 s3, s8, s3 ; GFX6-NEXT: s_sub_i32 s9, s9, s7 +; GFX6-NEXT: s_max_i32 s3, s8, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_max_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s8, s8, s7 +; GFX6-NEXT: s_max_i32 s3, s3, s4 ; GFX6-NEXT: s_min_i32 s3, s3, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 @@ -1423,18 +1423,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s6, -2 ; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, s6 ; GFX8-NEXT: s_brev_b32 s7, 1 +; GFX8-NEXT: s_sub_i32 s8, s8, s6 ; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_max_i32 s3, s8, s3 ; GFX8-NEXT: s_sub_i32 s9, s9, s7 +; GFX8-NEXT: s_max_i32 s3, s8, s3 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 ; GFX8-NEXT: s_sub_i32 s3, s3, s6 ; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s8, s8, s7 +; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 @@ -1478,18 +1478,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_subrev_i32_e32 v9, vcc, s5, v9 +; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 +; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 @@ -1513,18 +1513,18 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 ; GFX8-NEXT: s_brev_b32 s5, 1 +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s4, v8 ; GFX8-NEXT: v_min_i32_e32 v9, -1, v0 -; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s5, v9 +; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v9 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s5, v8 +; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 @@ -1570,18 +1570,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_brev_b32 s9, 1 +; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_max_i32 s4, s10, s4 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 +; GFX6-NEXT: s_max_i32 s4, s10, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s4, s4, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, -1 @@ -1604,18 +1604,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s8, -2 ; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 ; GFX8-NEXT: s_brev_b32 s9, 1 +; GFX8-NEXT: s_sub_i32 s10, s10, s8 ; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_max_i32 s4, s10, s4 ; GFX8-NEXT: s_sub_i32 s11, s11, s9 +; GFX8-NEXT: s_max_i32 s4, s10, s4 ; GFX8-NEXT: s_min_i32 s4, s4, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 ; GFX8-NEXT: s_sub_i32 s4, s4, s8 ; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s10, s10, s9 +; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 @@ -1671,18 +1671,18 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s4, v10 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 +; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 +; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 @@ -1691,11 +1691,11 @@ ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 @@ -1715,18 +1715,18 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 ; GFX8-NEXT: s_brev_b32 s5, 1 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s4, v10 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 -; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, s5, v12 +; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s5, v10 +; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 @@ -1735,11 +1735,11 @@ ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 @@ -1783,18 +1783,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s10, -2 ; GFX6-NEXT: s_max_i32 s12, s0, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, s10 ; GFX6-NEXT: s_brev_b32 s11, 1 +; GFX6-NEXT: s_sub_i32 s12, s12, s10 ; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_max_i32 s5, s12, s5 ; GFX6-NEXT: s_sub_i32 s13, s13, s11 +; GFX6-NEXT: s_max_i32 s5, s12, s5 ; GFX6-NEXT: s_min_i32 s5, s5, s13 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s10 ; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s12, s12, s11 +; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s5, s5, s12 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 @@ -1824,18 +1824,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s10, -2 ; GFX8-NEXT: s_max_i32 s12, s0, -1 -; GFX8-NEXT: s_sub_i32 s12, s12, s10 ; GFX8-NEXT: s_brev_b32 s11, 1 +; GFX8-NEXT: s_sub_i32 s12, s12, s10 ; GFX8-NEXT: s_min_i32 s13, s0, -1 -; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sub_i32 s13, s13, s11 +; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 ; GFX8-NEXT: s_sub_i32 s5, s5, s10 ; GFX8-NEXT: s_min_i32 s12, s1, -1 -; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s12, s12, s11 +; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s5, s5, s12 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 @@ -2191,18 +2191,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_brev_b32 s32, -2 ; GFX6-NEXT: s_max_i32 s34, s0, -1 -; GFX6-NEXT: s_sub_i32 s34, s34, s32 ; GFX6-NEXT: s_brev_b32 s33, 1 +; GFX6-NEXT: s_sub_i32 s34, s34, s32 ; GFX6-NEXT: s_min_i32 s35, s0, -1 -; GFX6-NEXT: s_max_i32 s16, s34, s16 ; GFX6-NEXT: s_sub_i32 s35, s35, s33 +; GFX6-NEXT: s_max_i32 s16, s34, s16 ; GFX6-NEXT: s_min_i32 s16, s16, s35 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 ; GFX6-NEXT: s_sub_i32 s16, s16, s32 ; GFX6-NEXT: s_min_i32 s34, s1, -1 -; GFX6-NEXT: s_max_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s34, s34, s33 +; GFX6-NEXT: s_max_i32 s16, s16, s17 ; GFX6-NEXT: s_min_i32 s16, s16, s34 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 @@ -2309,18 +2309,18 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_brev_b32 s32, -2 ; GFX8-NEXT: s_max_i32 s34, s0, -1 -; GFX8-NEXT: s_sub_i32 s34, s34, s32 ; GFX8-NEXT: s_brev_b32 s33, 1 +; GFX8-NEXT: s_sub_i32 s34, s34, s32 ; GFX8-NEXT: s_min_i32 s35, s0, -1 -; GFX8-NEXT: s_max_i32 s16, s34, s16 ; GFX8-NEXT: s_sub_i32 s35, s35, s33 +; GFX8-NEXT: s_max_i32 s16, s34, s16 ; GFX8-NEXT: s_min_i32 s16, s16, s35 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 ; GFX8-NEXT: s_sub_i32 s16, s16, s32 ; GFX8-NEXT: s_min_i32 s34, s1, -1 -; GFX8-NEXT: s_max_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s34, s34, s33 +; GFX8-NEXT: s_max_i32 s16, s16, s17 ; GFX8-NEXT: s_min_i32 s16, s16, s34 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 @@ -2520,9 +2520,9 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 @@ -2534,8 +2534,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 @@ -2563,9 +2563,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 @@ -2611,9 +2611,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 -; GFX6-NEXT: s_min_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_min_i32 s2, s0, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 @@ -2626,8 +2626,8 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s0 ; GFX8-NEXT: s_sext_i32_i16 s2, -1 ; GFX8-NEXT: s_max_i32 s3, s1, s2 -; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff +; GFX8-NEXT: s_min_i32 s1, s1, s2 ; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 @@ -2653,9 +2653,9 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 @@ -2696,8 +2696,8 @@ ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 @@ -2706,8 +2706,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 @@ -2721,16 +2721,16 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 -; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 @@ -2762,8 +2762,8 @@ ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: s_max_i32 s6, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: s_sub_i32 s6, s6, s4 ; GFX6-NEXT: s_min_i32 s7, s0, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s5 ; GFX6-NEXT: s_max_i32 s2, s6, s2 @@ -2791,8 +2791,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s6, s0 ; GFX8-NEXT: s_sext_i32_i16 s7, -1 -; GFX8-NEXT: s_max_i32 s8, s6, s7 ; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: s_max_i32 s8, s6, s7 ; GFX8-NEXT: s_sub_i32 s8, s8, s4 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 @@ -2848,8 +2848,8 @@ ; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s4, s4, s2 ; GFX6-NEXT: s_brev_b32 s3, 1 +; GFX6-NEXT: s_sub_i32 s4, s4, s2 ; GFX6-NEXT: s_min_i32 s5, s0, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s3 ; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 @@ -2857,8 +2857,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_min_i32 s2, s0, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 @@ -2877,10 +2877,10 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s4, s0 ; GFX8-NEXT: s_sext_i32_i16 s5, -1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_sub_i32 s6, s6, s2 +; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_movk_i32 s3, 0x8000 +; GFX8-NEXT: s_sub_i32 s6, s6, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_sub_i32 s4, s4, s3 @@ -2891,8 +2891,8 @@ ; GFX8-NEXT: s_sub_i32 s2, s6, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_sub_i32 s3, s4, s3 +; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 @@ -2921,20 +2921,20 @@ ; GFX6-NEXT: s_brev_b32 s2, -2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: s_brev_b32 s3, 1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 +; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 @@ -2950,17 +2950,17 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 ; GFX8-NEXT: s_movk_i32 s3, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 ; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3 +; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 ; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 @@ -3002,8 +3002,8 @@ ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s4, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 @@ -3017,27 +3017,27 @@ ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 @@ -3058,19 +3058,19 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 -; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 +; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 ; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 @@ -3079,8 +3079,8 @@ ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 -; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 ; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 ; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 @@ -3118,8 +3118,8 @@ ; GFX6-NEXT: s_brev_b32 s8, -2 ; GFX6-NEXT: s_max_i32 s10, s0, -1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_brev_b32 s9, 1 +; GFX6-NEXT: s_sub_i32 s10, s10, s8 ; GFX6-NEXT: s_min_i32 s11, s0, -1 ; GFX6-NEXT: s_sub_i32 s11, s11, s9 ; GFX6-NEXT: s_max_i32 s4, s10, s4 @@ -3128,31 +3128,31 @@ ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_min_i32 s10, s1, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s10, s1, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s9 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_min_i32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s6, s2, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 -; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s9 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s6 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s4 @@ -3171,8 +3171,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s10, s0 ; GFX8-NEXT: s_sext_i32_i16 s11, -1 -; GFX8-NEXT: s_max_i32 s12, s10, s11 ; GFX8-NEXT: s_movk_i32 s8, 0x7fff +; GFX8-NEXT: s_max_i32 s12, s10, s11 ; GFX8-NEXT: s_sub_i32 s12, s12, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_movk_i32 s9, 0x8000 @@ -3201,12 +3201,12 @@ ; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: s_max_i32 s6, s4, s11 ; GFX8-NEXT: s_sub_i32 s6, s6, s8 -; GFX8-NEXT: s_min_i32 s4, s4, s11 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_min_i32 s4, s4, s11 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sub_i32 s4, s4, s9 +; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 @@ -3275,8 +3275,8 @@ ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s4, v12 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, s5, v14 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 @@ -3290,60 +3290,60 @@ ; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, s5, v12 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 +; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, s5, v8 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -3355,19 +3355,19 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 ; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 ; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 ; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 ; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 @@ -3376,15 +3376,15 @@ ; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 ; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 ; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 ; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff +; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 ; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 @@ -3397,8 +3397,8 @@ ; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 @@ -3435,8 +3435,8 @@ ; GFX6-NEXT: s_brev_b32 s12, -2 ; GFX6-NEXT: s_max_i32 s14, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_sub_i32 s14, s14, s12 ; GFX6-NEXT: s_brev_b32 s13, 1 +; GFX6-NEXT: s_sub_i32 s14, s14, s12 ; GFX6-NEXT: s_min_i32 s15, s0, -1 ; GFX6-NEXT: s_sub_i32 s15, s15, s13 ; GFX6-NEXT: s_max_i32 s6, s14, s6 @@ -3445,49 +3445,49 @@ ; GFX6-NEXT: s_sub_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_min_i32 s14, s1, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_min_i32 s14, s1, -1 ; GFX6-NEXT: s_sub_i32 s14, s14, s13 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_min_i32 s6, s6, s14 +; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_min_i32 s8, s2, -1 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_min_i32 s8, s2, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 -; GFX6-NEXT: s_min_i32 s8, s3, -1 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_min_i32 s8, s3, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_min_i32 s8, s4, -1 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_min_i32 s8, s4, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 -; GFX6-NEXT: s_min_i32 s8, s5, -1 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_min_i32 s8, s5, -1 ; GFX6-NEXT: s_sub_i32 s8, s8, s13 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s8 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s6 @@ -3495,13 +3495,13 @@ ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s6 ; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 @@ -3512,8 +3512,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s14, s0 ; GFX8-NEXT: s_sext_i32_i16 s15, -1 -; GFX8-NEXT: s_max_i32 s16, s14, s15 ; GFX8-NEXT: s_movk_i32 s12, 0x7fff +; GFX8-NEXT: s_max_i32 s16, s14, s15 ; GFX8-NEXT: s_sub_i32 s16, s16, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_movk_i32 s13, 0x8000 @@ -3542,12 +3542,12 @@ ; GFX8-NEXT: s_sext_i32_i16 s6, s1 ; GFX8-NEXT: s_max_i32 s9, s6, s15 ; GFX8-NEXT: s_sub_i32 s9, s9, s12 -; GFX8-NEXT: s_min_i32 s6, s6, s15 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_min_i32 s6, s6, s15 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 @@ -3568,12 +3568,12 @@ ; GFX8-NEXT: s_sub_i32 s4, s7, s4 ; GFX8-NEXT: s_max_i32 s7, s6, s15 ; GFX8-NEXT: s_sub_i32 s7, s7, s12 -; GFX8-NEXT: s_min_i32 s6, s6, s15 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 +; GFX8-NEXT: s_min_i32 s6, s6, s15 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 @@ -3640,8 +3640,8 @@ ; GFX6-NEXT: s_brev_b32 s4, -2 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 ; GFX6-NEXT: s_brev_b32 s5, 1 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s4, v16 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, s5, v18 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 @@ -3655,84 +3655,84 @@ ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, s5, v16 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, s5, v10 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 +; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 +; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -3744,19 +3744,19 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 ; GFX8-NEXT: s_movk_i32 s5, 0x8000 +; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 ; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 ; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 ; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 ; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 ; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 ; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 @@ -3765,28 +3765,28 @@ ; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 ; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 ; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 ; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 ; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff +; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 +; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 -; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 ; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 +; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 ; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX8-NEXT: v_sub_u16_e32 v17, v17, v15 ; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 ; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 ; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 ; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 +; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 +; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 ; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 +; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 ; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 @@ -3797,13 +3797,13 @@ ; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 ; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 @@ -3842,8 +3842,8 @@ ; GFX6-NEXT: s_brev_b32 s16, -2 ; GFX6-NEXT: s_max_i32 s18, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_sub_i32 s18, s18, s16 ; GFX6-NEXT: s_brev_b32 s17, 1 +; GFX6-NEXT: s_sub_i32 s18, s18, s16 ; GFX6-NEXT: s_min_i32 s19, s0, -1 ; GFX6-NEXT: s_sub_i32 s19, s19, s17 ; GFX6-NEXT: s_max_i32 s8, s18, s8 @@ -3852,67 +3852,67 @@ ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_min_i32 s18, s1, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s18, s1, -1 ; GFX6-NEXT: s_sub_i32 s18, s18, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_min_i32 s8, s8, s18 +; GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_min_i32 s10, s2, -1 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s2, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 -; GFX6-NEXT: s_min_i32 s10, s3, -1 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s3, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 -; GFX6-NEXT: s_min_i32 s10, s4, -1 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s4, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_min_i32 s10, s5, -1 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s5, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s6, -1 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s6, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s7, -1 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 -; GFX6-NEXT: s_min_i32 s10, s7, -1 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_min_i32 s10, s7, -1 ; GFX6-NEXT: s_sub_i32 s10, s10, s17 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_and_b32 s1, s1, s8 @@ -3920,19 +3920,19 @@ ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_and_b32 s1, s2, s8 ; GFX6-NEXT: s_and_b32 s2, s3, s8 -; GFX6-NEXT: s_ashr_i32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_and_b32 s2, s4, s8 -; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_ashr_i32 s6, s6, 16 +; GFX6-NEXT: s_and_b32 s4, s7, s8 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s6, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 @@ -3943,8 +3943,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_sext_i32_i16 s18, s0 ; GFX8-NEXT: s_sext_i32_i16 s19, -1 -; GFX8-NEXT: s_max_i32 s20, s18, s19 ; GFX8-NEXT: s_movk_i32 s16, 0x7fff +; GFX8-NEXT: s_max_i32 s20, s18, s19 ; GFX8-NEXT: s_sub_i32 s20, s20, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_movk_i32 s17, 0x8000 @@ -3973,12 +3973,12 @@ ; GFX8-NEXT: s_sext_i32_i16 s8, s1 ; GFX8-NEXT: s_max_i32 s12, s8, s19 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 +; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 @@ -3999,12 +3999,12 @@ ; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_max_i32 s9, s8, s19 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 +; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 @@ -4024,12 +4024,12 @@ ; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_max_i32 s9, s8, s19 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 +; GFX8-NEXT: s_min_i32 s8, s8, s19 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 @@ -4448,8 +4448,8 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 @@ -4474,8 +4474,8 @@ ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 @@ -4500,8 +4500,8 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 ; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 ; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 @@ -4521,20 +4521,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] ; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] ; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 @@ -4555,8 +4555,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_add_u32 s0, s4, 0 @@ -4569,13 +4569,13 @@ ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4589,8 +4589,8 @@ ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s3, s4, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 @@ -4608,8 +4608,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_add_u32 s0, s4, 0 @@ -4622,13 +4622,13 @@ ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4642,8 +4642,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s3, s4, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 @@ -4661,8 +4661,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_add_u32 s0, s4, 0 @@ -4675,13 +4675,13 @@ ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_subb_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_subb_u32 s1, s3, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4695,8 +4695,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s3, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 @@ -4770,8 +4770,8 @@ ; GFX6-NEXT: s_and_b32 s11, s11, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -4791,11 +4791,11 @@ ; GFX6-NEXT: s_addc_u32 s1, s3, 0 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s3, 0 -; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_and_b32 s4, s4, 1 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 @@ -4804,13 +4804,13 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4837,8 +4837,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -4866,8 +4866,8 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s2, s3, 0 -; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_cmp_lg_u32 s4, 0 @@ -4876,13 +4876,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4909,8 +4909,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -4938,8 +4938,8 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s2, s3, 0 -; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_cmp_lg_u32 s4, 0 @@ -4948,13 +4948,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -5294,21 +5294,21 @@ ; GFX10-LABEL: ssubsat_i128_vs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 -; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo -; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo @@ -5540,19 +5540,19 @@ ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo ; GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[10:11], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 @@ -5599,8 +5599,8 @@ ; GFX6-NEXT: s_and_b32 s19, s19, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -5623,8 +5623,8 @@ ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s3, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 +; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: s_brev_b32 s8, 1 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc @@ -5642,24 +5642,24 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v4, s17 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 +; GFX6-NEXT: v_mov_b32_e32 v4, s17 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 ; GFX6-NEXT: s_subb_u32 s2, s6, s14 -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -5679,27 +5679,27 @@ ; GFX6-NEXT: s_addc_u32 s5, s7, 0 ; GFX6-NEXT: s_cselect_b32 s6, 1, 0 ; GFX6-NEXT: s_and_b32 s6, s6, 1 -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s6, s7, 0 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: v_mov_b32_e32 v8, s1 ; GFX6-NEXT: s_addc_u32 s7, s7, s8 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v8, s1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v8, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v8, s2 +; GFX6-NEXT: v_mov_b32_e32 v9, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 @@ -5729,8 +5729,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -5778,24 +5778,24 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -5823,24 +5823,24 @@ ; GFX8-NEXT: s_and_b32 s6, s6, 1 ; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s6, s7, 0 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_and_b32 s9, s9, 1 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: s_addc_u32 s7, s7, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v5 @@ -5870,8 +5870,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5919,24 +5919,24 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -5964,24 +5964,24 @@ ; GFX9-NEXT: s_and_b32 s6, s6, 1 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_addc_u32 s6, s7, 0 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: s_addc_u32 s7, s7, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -58,47 +58,47 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 +; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s4, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_lshr_b32 s0, s5, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s5, 16 +; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s5, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_lshr_b32 s0, s6, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s7, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s7, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 @@ -111,47 +111,47 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s5, s0, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s6, s0, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_lshr_b32 s6, s0, 16 ; GFX7-NEXT: s_lshr_b32 s7, s0, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s4, s1, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s4, s1, 16 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s1, s2, 16 ; GFX7-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s1, s3, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s1, s3, 16 +; GFX7-NEXT: s_lshr_b32 s2, s3, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:13 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_lshr_b32 s2, s3, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:14 ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:15 @@ -164,8 +164,8 @@ ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-NEXT: s_lshr_b32 s3, s4, 24 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8 @@ -174,8 +174,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_lshr_b32 s5, s6, 8 ; GFX10-NEXT: s_lshr_b32 s9, s6, 16 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v10, s5 ; GFX10-NEXT: s_lshr_b32 s0, s6, 24 @@ -220,23 +220,23 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_lshr_b32 s0, s5, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_lshr_b32 s0, s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_lshr_b32 s0, s7, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 @@ -249,23 +249,23 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:14 @@ -312,8 +312,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: ds_write2_b32 v1, v3, v0 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -55,36 +55,36 @@ ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: s_lshr_b32 s0, s12, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s12, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s12, 16 +; GFX9-NEXT: s_lshr_b32 s3, s12, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s12, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 ; GFX9-NEXT: s_lshr_b32 s0, s13, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: s_lshr_b32 s1, s13, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s13, 16 +; GFX9-NEXT: s_lshr_b32 s2, s13, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s13, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: s_lshr_b32 s0, s14, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: s_lshr_b32 s1, s14, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s1, s14, 16 +; GFX9-NEXT: s_lshr_b32 s2, s14, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s2, s14, 24 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 @@ -97,36 +97,36 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s3, s0, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: s_lshr_b32 s5, s0, 16 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s3, s1, 16 ; GFX7-NEXT: s_lshr_b32 s4, s1, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 ; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: s_lshr_b32 s3, s2, 24 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 @@ -139,9 +139,9 @@ ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s12, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s5, s13, 24 -; GFX10-NEXT: v_mov_b32_e32 v0, s12 ; GFX10-NEXT: s_lshr_b32 s1, s12, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s13 ; GFX10-NEXT: s_lshr_b32 s3, s12, 24 @@ -184,18 +184,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: s_lshr_b32 s0, s12, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: ds_write_b16 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v0, s13 ; GFX9-NEXT: s_lshr_b32 s0, s13, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s13 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: s_lshr_b32 s0, s14, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 @@ -208,18 +208,18 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -289,8 +289,8 @@ ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s3, s4 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 @@ -308,8 +308,8 @@ ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_mov_b32 s2, 0x80008 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -332,8 +332,8 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 @@ -385,13 +385,13 @@ ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -436,20 +436,20 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_add_u16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 @@ -536,13 +536,13 @@ ; GFX6-NEXT: s_not_b32 s5, s3 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 ; GFX6-NEXT: s_min_u32 s4, s5, s4 -; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -555,30 +555,30 @@ ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s6, s8 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp ; GFX8-NEXT: s_lshl_b32 s0, s4, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -592,19 +592,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_mov_b32 s4, 0x80008 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 @@ -613,19 +613,19 @@ ; GFX9-NEXT: s_lshr_b32 s7, s6, 16 ; GFX9-NEXT: s_lshl_b32 s4, s6, s4 ; GFX9-NEXT: s_lshl_b32 s6, s7, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 -; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -639,8 +639,8 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_mov_b32 s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 @@ -658,8 +658,8 @@ ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp @@ -1967,10 +1967,10 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2033,10 +2033,10 @@ ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -2044,20 +2044,20 @@ ; GFX8-LABEL: s_uaddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2131,16 +2131,16 @@ ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2218,16 +2218,16 @@ ; GFX6-NEXT: s_not_b32 s7, s5 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_min_u32 s6, s7, s6 -; GFX6-NEXT: s_add_i32 s5, s5, s6 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -2235,28 +2235,28 @@ ; GFX8-LABEL: s_uaddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2335,19 +2335,19 @@ ; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 @@ -2442,19 +2442,19 @@ ; GFX6-NEXT: s_not_b32 s9, s7 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_min_u32 s8, s9, s8 -; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_add_i32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 -; GFX6-NEXT: s_lshr_b32 s7, s7, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshr_b32 s7, s7, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: s_or_b32 s3, s6, s3 @@ -2463,35 +2463,35 @@ ; GFX8-LABEL: s_uaddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp ; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp -; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2838,8 +2838,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 @@ -2870,8 +2870,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 @@ -2902,8 +2902,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 @@ -2942,9 +2942,9 @@ ; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -2969,8 +2969,8 @@ ; GFX6-NEXT: s_and_b32 s8, s8, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -2980,8 +2980,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 @@ -3011,8 +3011,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -3024,8 +3024,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 @@ -3055,8 +3055,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -3068,8 +3068,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3479,8 +3479,8 @@ ; GFX6-NEXT: s_and_b32 s16, s16, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s11 ; GFX6-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -3503,17 +3503,17 @@ ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: s_and_b32 s3, s3, 1 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -3523,12 +3523,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 @@ -3558,8 +3558,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 @@ -3586,18 +3586,18 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: s_addc_u32 s3, s7, s15 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: s_addc_u32 s3, s7, s15 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -3607,12 +3607,12 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 @@ -3642,8 +3642,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 @@ -3670,18 +3670,18 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_addc_u32 s3, s7, s15 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: s_addc_u32 s3, s7, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -3691,12 +3691,12 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 @@ -3746,8 +3746,8 @@ ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s8, s8, 1 -; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 ; GFX10-NEXT: s_addc_u32 s7, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] ; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -88,8 +88,8 @@ ; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s6, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -129,11 +129,11 @@ ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -149,12 +149,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -179,12 +179,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 @@ -204,12 +204,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -235,12 +235,12 @@ ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v7 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s10, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc @@ -282,10 +282,10 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 @@ -301,15 +301,15 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 @@ -383,20 +383,20 @@ ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s10, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v10, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc @@ -415,11 +415,11 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX10-NEXT: s_sub_u32 s1, 0, s10 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_subb_u32 s2, 0, s11 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -511,8 +511,8 @@ ; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3 ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v0, 1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s8, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v2 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s8, v5 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s0, s9, v2, vcc_lo ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v5 @@ -536,8 +536,8 @@ ; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s10 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v9, 0 @@ -646,15 +646,15 @@ ; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 @@ -683,12 +683,10 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -700,17 +698,17 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 @@ -772,10 +770,10 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s9, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s9, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 @@ -900,8 +898,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v7, s11, v2 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v6 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v6 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 @@ -999,25 +997,25 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v12, s2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v5 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v3 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s11, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s11, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s8, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s9, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s10, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v13, s11, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v12, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] @@ -1051,11 +1049,11 @@ ; GFX8-NEXT: v_trunc_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX8-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -1071,12 +1069,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 ; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -1102,12 +1100,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v8, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v3, v3, v4 ; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], v5, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], v7, v5 @@ -1127,12 +1125,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v5, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -1158,24 +1156,24 @@ ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 +; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s11 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v9, s10 -; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_add_f32_e32 v4, v4, v9 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v4 @@ -1188,9 +1186,9 @@ ; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 ; GFX8-NEXT: s_subb_u32 s3, 0, s11 @@ -1212,12 +1210,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v11, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 ; GFX8-NEXT: v_mul_hi_u32 v9, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 +; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 @@ -1242,12 +1240,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v9, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v12, v6 ; GFX8-NEXT: v_mul_hi_u32 v12, v2, v8 -; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v11, s[0:1], v11, v12 +; GFX8-NEXT: v_mul_hi_u32 v7, v7, v8 ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v11, v9 @@ -1267,12 +1265,12 @@ ; GFX8-NEXT: v_mul_lo_u32 v9, s15, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_mul_hi_u32 v7, s14, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s15, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mul_hi_u32 v3, s15, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 @@ -1347,11 +1345,11 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0xcf800000, v1 ; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s11 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14 @@ -1368,15 +1366,15 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 @@ -1450,14 +1448,14 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_add_f32_e32 v5, v14, v5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 @@ -1470,9 +1468,9 @@ ; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX9-NEXT: v_add_f32_e32 v5, v12, v5 ; GFX9-NEXT: s_sub_u32 s8, 0, s10 -; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 ; GFX9-NEXT: s_subb_u32 s9, 0, s11 @@ -1503,10 +1501,10 @@ ; GFX9-NEXT: v_add_co_u32_e64 v10, s[2:3], v12, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v9, s[2:3], v10, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v12, v13, v12 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3] ; GFX9-NEXT: v_add3_u32 v10, v12, v10, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v10, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v12, s9, v5 ; GFX9-NEXT: v_mul_lo_u32 v13, s8, v9 @@ -1597,8 +1595,8 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s10, v11 ; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v6, vcc @@ -1674,64 +1672,63 @@ ; GFX10-NEXT: v_mul_hi_u32 v14, v0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v13, v7 -; GFX10-NEXT: v_mul_hi_u32 v17, v1, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v15 -; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v11, s0, v16, v11 -; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v6 +; GFX10-NEXT: v_mul_hi_u32 v17, v1, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v10 -; GFX10-NEXT: v_add_nc_u32_e32 v5, v12, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, v12, v5 ; GFX10-NEXT: v_add_co_u32 v10, s0, v11, v17 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v7 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v15, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v6, v5 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v15, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v16, v11 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 +; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8 ; GFX10-NEXT: v_add_co_u32 v9, s0, v10, v9 -; GFX10-NEXT: v_add3_u32 v4, v7, v6, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v16, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v7, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo +; GFX10-NEXT: v_add3_u32 v4, v7, v6, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX10-NEXT: v_add3_u32 v5, v11, v10, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo ; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v9 ; GFX10-NEXT: v_mul_lo_u32 v8, s1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s1, v3, v5, s0 ; GFX10-NEXT: v_mul_hi_u32 v9, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v11, s2, v6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s1, v3, v5, s0 ; GFX10-NEXT: v_mul_lo_u32 v13, s6, v1 ; GFX10-NEXT: v_mul_hi_u32 v14, s3, v1 -; GFX10-NEXT: v_mul_lo_u32 v12, s3, v1 -; GFX10-NEXT: v_mul_lo_u32 v16, v6, v7 ; GFX10-NEXT: v_mul_lo_u32 v15, s3, v10 -; GFX10-NEXT: v_mul_hi_u32 v17, v0, v7 -; GFX10-NEXT: v_add3_u32 v8, v8, v11, v9 -; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX10-NEXT: v_mul_lo_u32 v7, s2, v0 +; GFX10-NEXT: v_mul_lo_u32 v12, s3, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5 -; GFX10-NEXT: v_mul_lo_u32 v9, v10, v12 -; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8 +; GFX10-NEXT: v_add3_u32 v8, v8, v11, v9 ; GFX10-NEXT: v_add3_u32 v13, v13, v15, v14 +; GFX10-NEXT: v_mul_lo_u32 v16, v6, v7 ; GFX10-NEXT: v_mul_lo_u32 v14, v0, v8 +; GFX10-NEXT: v_mul_hi_u32 v17, v0, v7 +; GFX10-NEXT: v_mul_hi_u32 v7, v6, v7 ; GFX10-NEXT: v_mul_lo_u32 v15, v6, v8 +; GFX10-NEXT: v_mul_lo_u32 v9, v10, v12 +; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8 ; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v11, v1, v12 ; GFX10-NEXT: v_mul_lo_u32 v8, v1, v13 -; GFX10-NEXT: v_mul_hi_u32 v12, v10, v12 -; GFX10-NEXT: v_mul_lo_u32 v19, v10, v13 -; GFX10-NEXT: v_mul_hi_u32 v20, v1, v13 ; GFX10-NEXT: v_add_co_u32 v14, s1, v16, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v7, s1, v15, v7 +; GFX10-NEXT: v_mul_hi_u32 v11, v1, v12 +; GFX10-NEXT: v_mul_hi_u32 v12, v10, v12 +; GFX10-NEXT: v_mul_lo_u32 v19, v10, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v9, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s1 @@ -1745,20 +1742,21 @@ ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v14 +; GFX10-NEXT: v_mul_hi_u32 v20, v1, v13 ; GFX10-NEXT: v_add_nc_u32_e32 v11, v15, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 -; GFX10-NEXT: v_add_co_u32 v12, s1, v12, v20 ; GFX10-NEXT: v_add_nc_u32_e32 v8, v9, v8 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s1 ; GFX10-NEXT: v_add3_u32 v4, v11, v14, v6 +; GFX10-NEXT: v_add_co_u32 v12, s1, v12, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s1 ; GFX10-NEXT: v_mul_hi_u32 v6, v10, v13 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v9, v17, v15 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, s1, v12, v8 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v9, v17, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: v_add3_u32 v5, v9, v8, v6 ; GFX10-NEXT: v_mul_lo_u32 v6, s17, v0 ; GFX10-NEXT: v_mul_lo_u32 v7, s16, v2 @@ -1768,47 +1766,47 @@ ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, vcc_lo, v3, v5, s0 ; GFX10-NEXT: v_mul_hi_u32 v5, s16, v2 ; GFX10-NEXT: v_mul_hi_u32 v2, s17, v2 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v8, s0, v9, v8 ; GFX10-NEXT: v_add_co_u32 v0, s1, v6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v8, v5 -; GFX10-NEXT: v_mul_hi_u32 v12, s18, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v7, v0 -; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v8 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_add_co_u32 v0, s0, v5, v0 +; GFX10-NEXT: v_mul_hi_u32 v7, s18, v1 ; GFX10-NEXT: v_mul_hi_u32 v1, s19, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 ; GFX10-NEXT: v_mul_lo_u32 v8, s9, v0 -; GFX10-NEXT: v_mul_hi_u32 v9, s8, v0 ; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2 ; GFX10-NEXT: v_mul_lo_u32 v4, s18, v3 ; GFX10-NEXT: v_mul_lo_u32 v5, s19, v3 +; GFX10-NEXT: v_mul_hi_u32 v9, s8, v0 ; GFX10-NEXT: v_mul_lo_u32 v13, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, s19, v3 ; GFX10-NEXT: v_mul_lo_u32 v11, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v12, s18, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, s19, v3 ; GFX10-NEXT: v_add_co_u32 v4, s0, v6, v4 ; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 ; GFX10-NEXT: v_add3_u32 v5, v8, v11, v9 -; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, s16, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s17, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 ; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s17, v5, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v5 +; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v8, s8 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v7, vcc_lo @@ -1829,35 +1827,35 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v4, s0, v17, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v18, s0 -; GFX10-NEXT: v_sub_co_u32 v19, s0, v13, s8 ; GFX10-NEXT: v_add3_u32 v3, v11, v1, v3 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v7, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 ; GFX10-NEXT: v_mul_lo_u32 v6, s11, v12 -; GFX10-NEXT: v_mul_hi_u32 v11, s10, v12 ; GFX10-NEXT: v_mul_lo_u32 v7, s10, v3 +; GFX10-NEXT: v_mul_hi_u32 v11, s10, v12 +; GFX10-NEXT: v_sub_co_u32 v19, s0, v13, s8 ; GFX10-NEXT: v_mul_lo_u32 v16, s10, v12 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v20, s0, 0, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v18, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v20, vcc_lo ; GFX10-NEXT: v_add3_u32 v6, v6, v7, v11 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v20, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v7, s1, s18, v16 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, v4, s0 -; GFX10-NEXT: v_sub_co_u32 v7, s1, s18, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v19, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s19, v6 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s11, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v13, v19, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v9 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v14, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, s11, v2, s1 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s10, v7 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s11, v9 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s11, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1 ; GFX10-NEXT: v_sub_co_u32 v13, s1, v7, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v15 @@ -1875,8 +1873,8 @@ ; GFX10-NEXT: v_sub_co_u32 v8, s1, v13, s10 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v13, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v12, v11, s1 @@ -1984,8 +1982,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -2046,8 +2044,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 @@ -2110,9 +2108,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 @@ -2125,9 +2123,9 @@ ; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 @@ -2138,10 +2136,7 @@ ; ; GFX10-LABEL: udivrem_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0 ; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010 @@ -2149,6 +2144,7 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -2171,8 +2167,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo @@ -2195,6 +2191,7 @@ ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_short v1, v0, s[4:5] ; GFX10-NEXT: global_store_short v1, v2, s[6:7] ; GFX10-NEXT: s_endpgm @@ -2296,8 +2293,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -2361,8 +2358,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 @@ -2426,11 +2423,11 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2487,8 +2484,8 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo @@ -2554,8 +2551,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2590,8 +2587,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 7, v1 @@ -2617,8 +2614,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 @@ -2673,8 +2670,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 ; GFX8-NEXT: v_and_b32_e32 v2, s6, v2 -; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -2710,8 +2707,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 @@ -2738,8 +2735,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -281,8 +281,8 @@ ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s3, s4 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: s_lshl_b32 s0, s2, s4 @@ -300,8 +300,8 @@ ; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_mov_b32 s2, 0x80008 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshl_b32 s3, s3, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -324,8 +324,8 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_mov_b32 s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 @@ -373,13 +373,13 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -424,20 +424,20 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_sub_u16 v1, v2, v3 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 @@ -520,13 +520,13 @@ ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_lshr_b32 s1, s1, 24 ; GFX6-NEXT: s_min_u32 s4, s3, s4 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_lshr_b32 s2, s2, 24 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 16 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -539,30 +539,30 @@ ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_lshl_b32 s0, s2, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s6, s8 -; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, s8 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp ; GFX8-NEXT: s_lshl_b32 s0, s4, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -576,19 +576,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_mov_b32 s4, 0x80008 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshl_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, s4 @@ -597,19 +597,19 @@ ; GFX9-NEXT: s_lshr_b32 s7, s6, 16 ; GFX9-NEXT: s_lshl_b32 s4, s6, s4 ; GFX9-NEXT: s_lshl_b32 s6, s7, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: v_pk_sub_u16 v1, s3, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 -; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -623,8 +623,8 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_mov_b32 s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 @@ -642,8 +642,8 @@ ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp @@ -1869,10 +1869,10 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1931,10 +1931,10 @@ ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -1942,20 +1942,20 @@ ; GFX8-LABEL: s_usubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2023,16 +2023,16 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2104,16 +2104,16 @@ ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_min_u32 s6, s5, s6 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: ; return to shader part epilog @@ -2121,28 +2121,28 @@ ; GFX8-LABEL: s_usubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2213,19 +2213,19 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 @@ -2312,19 +2312,19 @@ ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_lshr_b32 s1, s1, 16 ; GFX6-NEXT: s_min_u32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s3, s3, 16 -; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: s_lshl_b32 s1, s3, 16 -; GFX6-NEXT: s_lshr_b32 s7, s7, 16 ; GFX6-NEXT: s_lshr_b32 s5, s5, 16 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshr_b32 s7, s7, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s3, 16 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s2, s5, 16 ; GFX6-NEXT: s_lshl_b32 s3, s7, 16 ; GFX6-NEXT: s_or_b32 s2, s4, s2 ; GFX6-NEXT: s_or_b32 s3, s6, s3 @@ -2333,35 +2333,35 @@ ; GFX8-LABEL: s_usubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp +; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp ; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp -; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2713,13 +2713,13 @@ ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX6-NEXT: s_subb_u32 s1, s3, s7 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 @@ -2745,13 +2745,13 @@ ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX8-NEXT: s_subb_u32 s1, s3, s7 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -2777,13 +2777,13 @@ ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: s_subb_u32 s1, s3, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX9-NEXT: s_subb_u32 s1, s3, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 @@ -2831,9 +2831,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_subb_u32 s9, s1, s5 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_and_b32 s10, s10, 1 @@ -2844,14 +2844,14 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: s_and_b32 s11, s11, 1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_subb_u32 s11, s3, s7 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s9 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s10 @@ -2881,8 +2881,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -2894,8 +2894,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s10 @@ -2925,8 +2925,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -2938,8 +2938,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s10 @@ -2979,13 +2979,13 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) ret i128 %result @@ -3312,16 +3312,16 @@ ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5 @@ -3344,8 +3344,8 @@ ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_and_b32 s18, s18, 1 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 @@ -3356,27 +3356,27 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: s_subb_u32 s19, s3, s11 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: s_and_b32 s1, s1, 1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s16 ; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v2, s12 -; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NEXT: s_subb_u32 s1, s5, s13 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX6-NEXT: s_subb_u32 s1, s5, s13 ; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: s_and_b32 s2, s2, 1 @@ -3387,18 +3387,18 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] ; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_subb_u32 s3, s7, s15 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX6-NEXT: v_readfirstlane_b32 s0, v4 @@ -3428,8 +3428,8 @@ ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 @@ -3449,25 +3449,25 @@ ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: s_cmp_lg_u32 s3, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: s_subb_u32 s3, s7, s15 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: s_subb_u32 s3, s7, s15 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] @@ -3477,12 +3477,12 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX8-NEXT: v_readfirstlane_b32 s0, v4 @@ -3512,8 +3512,8 @@ ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 @@ -3533,25 +3533,25 @@ ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_subb_u32 s3, s7, s15 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX9-NEXT: s_subb_u32 s3, s7, s15 ; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] @@ -3561,12 +3561,12 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v4 @@ -3616,15 +3616,15 @@ ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_and_b32 s0, s0, 1 -; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15] ; GFX10-NEXT: s_subb_u32 s9, s7, s15 ; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15] ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v1 @@ -3636,13 +3636,13 @@ ; GFX10-NEXT: v_readfirstlane_b32 s0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s1, v2 ; GFX10-NEXT: v_readfirstlane_b32 s2, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo ; GFX10-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s5, v1 ; GFX10-NEXT: v_readfirstlane_b32 s6, v2 ; GFX10-NEXT: v_readfirstlane_b32 s7, v3 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -38,10 +38,10 @@ ; ; GFX8-LABEL: scalar_xnor_v2i16_one_use: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, s2 ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 @@ -124,8 +124,8 @@ ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 ; GFX7-NEXT: s_and_b32 s2, s2, s8 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s3, s4, s8 ; GFX7-NEXT: s_lshl_b32 s2, s5, 16 +; GFX7-NEXT: s_and_b32 s3, s4, s8 ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s3, s7, 16 ; GFX7-NEXT: s_and_b32 s4, s6, s8 @@ -138,8 +138,8 @@ ; ; GFX8-LABEL: scalar_xnor_v4i16_one_use: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_and_b32 s2, s0, s4 ; GFX8-NEXT: s_mov_b32 s5, s4 diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -217,9 +217,9 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) { ; VI-LABEL: add3_uniform_vgpr: ; VI: ; %bb.0: -; VI-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-NEXT: v_add_f32_e64 v0, s2, 1.0 ; VI-NEXT: v_add_f32_e64 v1, s3, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 @@ -227,9 +227,9 @@ ; ; GFX9-LABEL: add3_uniform_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s3, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -33,9 +33,9 @@ ; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v8 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a0, v8 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v4 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v5 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v6 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s @@ -191,11 +191,10 @@ ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -212,6 +211,7 @@ ; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -483,14 +483,13 @@ ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -512,6 +511,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -690,13 +690,12 @@ ; GFX9-LABEL: urem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_and_b32 s4, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 @@ -707,6 +706,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -901,8 +901,6 @@ ; GFX9-LABEL: srem_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s5, s4, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 @@ -912,6 +910,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s6, s2, 1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -923,6 +922,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1087,13 +1087,12 @@ ; GFX9-LABEL: urem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 @@ -1103,6 +1102,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1297,8 +1297,6 @@ ; GFX9-LABEL: srem_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -1318,8 +1316,10 @@ ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1530,17 +1530,17 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 ; GFX6-NEXT: s_sub_i32 s0, 0, s11 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 @@ -1556,13 +1556,13 @@ ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 @@ -1612,17 +1612,17 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 +; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -1646,8 +1646,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 @@ -1689,32 +1689,32 @@ ; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX90A-NEXT: s_sub_i32 s2, 0, s9 +; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s9 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 ; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: s_sub_i32 s2, 0, s10 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 @@ -1905,8 +1905,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -1921,8 +1921,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -1930,19 +1930,19 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 @@ -2013,9 +2013,9 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 -; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 @@ -2328,8 +2328,8 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_ashr_i32 s0, s5, 31 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_add_i32 s1, s5, s0 ; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX6-NEXT: s_ashr_i32 s3, s10, 31 @@ -2363,17 +2363,17 @@ ; GFX6-NEXT: s_add_i32 s5, s11, s2 ; GFX6-NEXT: s_add_i32 s1, s6, s0 ; GFX6-NEXT: s_xor_b32 s5, s5, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 ; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: s_xor_b32 s3, s0, s3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 -; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 @@ -2476,8 +2476,8 @@ ; GFX9-NEXT: s_add_i32 s9, s11, s8 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: s_xor_b32 s9, s9, s8 -; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 ; GFX9-NEXT: s_ashr_i32 s5, s6, 31 ; GFX9-NEXT: s_add_i32 s6, s6, s5 @@ -2488,9 +2488,9 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 ; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_xor_b32 s2, s13, s12 +; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 ; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 ; GFX9-NEXT: s_xor_b32 s2, s5, s3 @@ -2852,9 +2852,9 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; GFX6-NEXT: s_xor_b32 s4, s5, s13 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; GFX6-NEXT: s_ashr_i32 s5, s10, 31 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX6-NEXT: s_add_i32 s8, s10, s5 @@ -2866,7 +2866,6 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: s_ashr_i32 s8, s11, 31 ; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 @@ -2879,6 +2878,7 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 +; GFX6-NEXT: s_ashr_i32 s8, s11, 31 ; GFX6-NEXT: s_add_i32 s9, s11, s8 ; GFX6-NEXT: s_ashr_i32 s4, s6, 31 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 @@ -3072,11 +3072,11 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: s_sub_i32 s4, 0, s8 ; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 -; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 -; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 ; GFX90A-NEXT: s_add_i32 s3, s5, s2 +; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX90A-NEXT: s_xor_b32 s3, s3, s2 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 @@ -3084,9 +3084,9 @@ ; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 ; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 ; GFX90A-NEXT: s_add_i32 s4, s10, s3 -; GFX90A-NEXT: s_xor_b32 s3, s4, s3 ; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 +; GFX90A-NEXT: s_xor_b32 s3, s4, s3 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 @@ -3109,9 +3109,9 @@ ; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX90A-NEXT: s_ashr_i32 s4, s11, 31 ; GFX90A-NEXT: s_add_i32 s5, s11, s4 -; GFX90A-NEXT: s_xor_b32 s4, s5, s4 ; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: s_xor_b32 s4, s5, s4 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2 @@ -3245,13 +3245,13 @@ ; GFX6-NEXT: s_lshr_b32 s9, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, s8 ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3262,22 +3262,21 @@ ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: s_lshr_b32 s10, s3, 16 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: s_and_b32 s1, s1, s8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4 @@ -3285,9 +3284,10 @@ ; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -3321,22 +3321,22 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s8 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: s_and_b32 s0, s5, s8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 @@ -3344,9 +3344,9 @@ ; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3380,22 +3380,22 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: s_and_b32 s0, s5, s8 ; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: s_and_b32 s0, s5, s8 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 @@ -3403,9 +3403,9 @@ ; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3536,8 +3536,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GFX6-NEXT: s_and_b32 s2, s3, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 @@ -3547,16 +3547,16 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_lshr_b32 s12, s3, 16 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 -; GFX6-NEXT: s_lshr_b32 s10, s1, 16 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 +; GFX6-NEXT: s_lshr_b32 s10, s1, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -3608,20 +3608,20 @@ ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: s_and_b32 s6, s5, s8 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX9-NEXT: s_lshr_b32 s1, s5, 16 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 @@ -3677,21 +3677,21 @@ ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: s_and_b32 s4, s5, s8 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 @@ -3700,12 +3700,12 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 ; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3835,8 +3835,8 @@ ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc @@ -3852,8 +3852,8 @@ ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: s_sext_i32_i16 s0, s3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s2, s1 @@ -3868,8 +3868,8 @@ ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, s0 ; GFX6-NEXT: s_ashr_i32 s0, s3, 16 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 @@ -3931,9 +3931,9 @@ ; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 ; GFX9-NEXT: s_or_b32 s4, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 @@ -3948,8 +3948,8 @@ ; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: s_ashr_i32 s1, s7, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 @@ -4009,9 +4009,9 @@ ; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 ; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_sext_i32_i16 s1, s7 -; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v4 @@ -4026,8 +4026,8 @@ ; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 @@ -4222,8 +4222,8 @@ ; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 ; GFX6-NEXT: v_mov_b32_e32 v5, s0 ; GFX6-NEXT: s_ashr_i32 s0, s3, 16 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GFX6-NEXT: s_ashr_i32 s2, s1, 16 @@ -4243,11 +4243,11 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 @@ -4307,8 +4307,8 @@ ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cselect_b32 s0, s6, 0 ; GFX9-NEXT: s_ashr_i32 s6, s7, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 @@ -4358,8 +4358,8 @@ ; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 ; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 @@ -4394,8 +4394,8 @@ ; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 -; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 @@ -4419,9 +4419,9 @@ ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 ; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -4573,8 +4573,6 @@ ; GFX9-LABEL: urem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 @@ -4587,11 +4585,13 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 ; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -4793,8 +4793,6 @@ ; GFX9-LABEL: srem_i3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 @@ -4814,9 +4812,11 @@ ; GFX9-NEXT: s_cselect_b32 s2, s6, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -4930,13 +4930,13 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_and_b32 s6, s2, s8 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX6-NEXT: s_lshr_b32 s0, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -4960,8 +4960,8 @@ ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5156,21 +5156,21 @@ ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 ; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX6-NEXT: s_and_b32 s0, s1, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: s_and_b32 s0, s1, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX6-NEXT: s_and_b32 s0, s3, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 @@ -5188,8 +5188,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5282,8 +5282,8 @@ ; GFX90A-NEXT: s_and_b32 s1, s5, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -5403,8 +5403,8 @@ ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 @@ -5419,8 +5419,8 @@ ; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc ; GFX6-NEXT: s_sext_i32_i16 s1, s3 @@ -5480,12 +5480,11 @@ ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_sext_i32_i16 s1, s7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 @@ -5501,6 +5500,7 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 ; GFX9-NEXT: global_store_dword v1, v2, s[2:3] @@ -5541,12 +5541,11 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 ; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| +; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_sext_i32_i16 s1, s7 -; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX90A-NEXT: s_sext_i32_i16 s0, s5 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 @@ -5562,6 +5561,7 @@ ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX90A-NEXT: global_store_short v1, v0, s[2:3] offset:4 ; GFX90A-NEXT: global_store_dword v1, v2, s[2:3] @@ -5719,8 +5719,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5786,8 +5786,8 @@ ; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[4:5] @@ -5853,8 +5853,8 @@ ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 ; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] @@ -5942,16 +5942,16 @@ ; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: s_and_b32 s9, s0, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_and_b32 s8, s2, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 @@ -5977,8 +5977,8 @@ ; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc ; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -6002,8 +6002,8 @@ ; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 @@ -6014,8 +6014,8 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -6034,8 +6034,8 @@ ; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] @@ -6058,8 +6058,8 @@ ; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: s_bfe_u32 s1, s4, 0xf000f +; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 @@ -6070,8 +6070,8 @@ ; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 ; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 @@ -6213,8 +6213,8 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 ; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 @@ -6222,8 +6222,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: s_lshr_b32 s8, s2, 15 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 @@ -6263,37 +6263,37 @@ ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f ; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 ; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 ; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 +; GFX9-NEXT: s_lshr_b32 s0, s6, 15 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 ; GFX9-NEXT: s_lshr_b32 s0, s4, 15 ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 @@ -6472,8 +6472,8 @@ ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX6-NEXT: s_or_b32 s1, s1, 1 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 @@ -6488,8 +6488,8 @@ ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 ; GFX6-NEXT: s_or_b32 s0, s0, 1 ; GFX6-NEXT: v_mov_b32_e32 v6, s0 @@ -6543,26 +6543,26 @@ ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: s_cselect_b32 s0, s5, 0 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 ; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: s_or_b32 s4, s0, 1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 @@ -6582,8 +6582,8 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 ; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] @@ -6613,26 +6613,26 @@ ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 ; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 ; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf000f ; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX90A-NEXT: s_xor_b32 s0, s0, s1 -; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 +; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 ; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX90A-NEXT: s_or_b32 s4, s0, 1 +; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 -; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 @@ -6777,16 +6777,16 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: s_bfe_u32 s12, s0, 0xf000f +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 ; GFX6-NEXT: s_lshr_b32 s1, s0, 15 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 ; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: s_lshr_b32 s8, s2, 15 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 @@ -6798,8 +6798,8 @@ ; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_mov_b32_e32 v6, s0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -6820,12 +6820,12 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 +; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -6856,13 +6856,13 @@ ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_or_b32 s11, s0, 1 ; GFX9-NEXT: s_lshr_b32 s9, s4, 15 +; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX9-NEXT: s_lshr_b32 s7, s6, 15 ; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f +; GFX9-NEXT: s_or_b32 s11, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cselect_b32 s0, s11, 0 @@ -6942,12 +6942,12 @@ ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX90A-NEXT: s_or_b32 s11, s0, 1 ; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 ; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f +; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 ; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f +; GFX90A-NEXT: s_or_b32 s11, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 @@ -6967,8 +6967,8 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 @@ -6989,8 +6989,8 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v4 ; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 @@ -7441,8 +7441,8 @@ ; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 ; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v4 @@ -7880,14 +7880,14 @@ ; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -8202,8 +8202,8 @@ ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: s_lshr_b32 s2, s2, 20 ; GFX6-NEXT: s_ashr_i32 s3, s1, 31 +; GFX6-NEXT: s_lshr_b32 s2, s2, 20 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_lshr_b32 s2, s3, 20 ; GFX6-NEXT: s_add_i32 s1, s1, s2 @@ -8280,9 +8280,9 @@ ; GFX6-NEXT: s_lshr_b32 s2, s2, 20 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GFX6-NEXT: s_ashr_i32 s0, s0, 12 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -8477,11 +8477,11 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 @@ -8541,8 +8541,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 @@ -8801,8 +8801,6 @@ ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -8810,6 +8808,7 @@ ; GFX9-NEXT: s_xor_b32 s3, s3, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -8831,6 +8830,7 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -9054,8 +9054,8 @@ ; GFX6-NEXT: s_ashr_i32 s6, s3, 31 ; GFX6-NEXT: s_add_i32 s3, s3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s9, 0, s2 +; GFX6-NEXT: s_xor_b32 s3, s3, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -9078,8 +9078,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: s_ashr_i32 s0, s1, 31 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 +; GFX6-NEXT: s_ashr_i32 s0, s1, 31 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GFX6-NEXT: s_add_i32 s1, s1, s0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -9088,8 +9088,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 @@ -9270,16 +9270,14 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: s_mov_b32 s4, 0x976a7376 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc @@ -9290,21 +9288,20 @@ ; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 -; GFX6-NEXT: s_movk_i32 s2, 0x11f ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX6-NEXT: s_mov_b32 s9, s5 +; GFX6-NEXT: s_movk_i32 s2, 0x11f ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX6-NEXT: s_mov_b32 s3, 0x976a7377 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc @@ -9323,6 +9320,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_mov_b32 s4, 0x976a7376 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc @@ -9332,6 +9331,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s2 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -9403,8 +9403,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 @@ -9479,8 +9479,8 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] @@ -9511,10 +9511,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -9522,8 +9522,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 @@ -9533,10 +9533,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 ; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 ; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 @@ -9552,10 +9552,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -9563,8 +9563,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 ; GFX90A-NEXT: s_movk_i32 s2, 0x11f +; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 @@ -9572,9 +9572,9 @@ ; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 -; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, s2 +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] @@ -9597,8 +9597,8 @@ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s2, v3 ; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -9814,15 +9814,15 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, v0, s2 +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] +; GFX6-NEXT: v_mul_hi_u32 v4, v0, s2 ; GFX6-NEXT: v_mul_lo_u32 v5, v2, s2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, s2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v0, v6 @@ -9923,8 +9923,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 ; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 @@ -9968,12 +9968,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc @@ -10020,10 +10020,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 @@ -10031,8 +10031,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s8 ; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 @@ -10041,10 +10041,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s8 ; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 ; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v9 ; GFX90A-NEXT: v_mul_lo_u32 v9, v2, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 @@ -10060,10 +10060,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v8, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -10071,8 +10071,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc +; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s0 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s0 ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 @@ -10095,8 +10095,8 @@ ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc -; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, s4 @@ -10215,16 +10215,14 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: s_movk_i32 s4, 0x11f ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc @@ -10235,21 +10233,20 @@ ; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 -; GFX6-NEXT: s_movk_i32 s5, 0x11e ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_movk_i32 s4, 0x11f ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc @@ -10268,6 +10265,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_movk_i32 s5, 0x11e +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc @@ -10277,22 +10276,23 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 ; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -10350,8 +10350,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 @@ -10399,20 +10399,20 @@ ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s6, 0x11e ; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 @@ -10457,10 +10457,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -10468,8 +10468,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 @@ -10479,10 +10479,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 ; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 ; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 @@ -10498,10 +10498,10 @@ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -10509,8 +10509,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX90A-NEXT: s_movk_i32 s8, 0x11f +; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 @@ -10518,21 +10518,21 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 -; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 ; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] ; GFX90A-NEXT: s_movk_i32 s6, 0x11e ; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 ; GFX90A-NEXT: s_mov_b32 s10, 0x9761f7c8 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 -; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 @@ -10831,12 +10831,12 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -10852,22 +10852,22 @@ ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 @@ -10886,8 +10886,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s1, s11, s2 -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 @@ -10911,8 +10911,8 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s3, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -10928,8 +10928,8 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 @@ -10951,11 +10951,11 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s7, 31 @@ -10975,8 +10975,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 @@ -10989,8 +10989,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 ; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 @@ -11000,10 +11000,10 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 @@ -11019,12 +11019,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s1 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v4 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc @@ -11064,20 +11064,20 @@ ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -11085,8 +11085,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 @@ -11115,14 +11115,14 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s3, s7, s0 -; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 @@ -11130,8 +11130,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc +; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s1 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 @@ -11259,10 +11259,10 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -11280,10 +11280,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s4, v0 @@ -11293,8 +11293,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 ; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -11309,10 +11309,10 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] +; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 @@ -11395,10 +11395,10 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s12, v0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 @@ -11408,17 +11408,17 @@ ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v5, s12, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, s12, v0 @@ -11432,8 +11432,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 @@ -11449,8 +11449,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: s_mov_b32 s3, s2 ; GFX9-NEXT: s_addc_u32 s1, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -11496,8 +11496,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -11530,30 +11530,30 @@ ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_hi_u32 v5, s10, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s10, v0 ; GFX90A-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v6, s10, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, s10, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v0 @@ -11574,23 +11574,23 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -11630,8 +11630,8 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -11765,29 +11765,29 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX6-NEXT: s_add_u32 s2, s8, s0 ; GFX6-NEXT: s_addc_u32 s3, s9, 0 -; GFX6-NEXT: s_ashr_i32 s8, s11, 31 +; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: s_ashr_i32 s8, s11, 31 ; GFX6-NEXT: s_mov_b32 s9, s8 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v5, v2, s6 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, s6 @@ -11795,8 +11795,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 ; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -11811,10 +11811,10 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX6-NEXT: s_add_u32 s0, s10, s8 -; GFX6-NEXT: s_addc_u32 s1, s11, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_addc_u32 s1, s11, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 @@ -11839,8 +11839,8 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 ; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -11856,13 +11856,13 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 ; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 @@ -11908,10 +11908,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v5, v2, s8 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, s8 @@ -11924,8 +11924,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 ; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 @@ -11939,8 +11939,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s3, s2 ; GFX9-NEXT: s_addc_u32 s7, s7, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -11956,12 +11956,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v5 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc @@ -11982,9 +11982,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 @@ -12018,10 +12018,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 @@ -12033,8 +12033,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s8 @@ -12062,14 +12062,14 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s7, s7, s0 -; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -12077,8 +12077,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_movk_i32 s1, 0xfff ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc +; GFX90A-NEXT: s_movk_i32 s1, 0xfff ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s1 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 @@ -12105,9 +12105,9 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX90A-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NEXT: v_mov_b32_e32 v1, s5 @@ -12160,10 +12160,10 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -12181,10 +12181,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 @@ -12194,8 +12194,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 ; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -12215,8 +12215,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s1, s9, s2 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 ; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 @@ -12276,14 +12276,14 @@ ; GFX6-NEXT: v_rcp_f32_e32 v3, v10 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: s_sub_u32 s14, 0, s12 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] ; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 ; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] +; GFX6-NEXT: s_sub_u32 s14, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_mul_hi_u32 v2, s14, v3 ; GFX6-NEXT: v_mul_lo_u32 v7, s14, v5 @@ -12294,21 +12294,21 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s14, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc +; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 ; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v8, s14, v3 ; GFX6-NEXT: v_mul_hi_u32 v9, s14, v2 @@ -12317,8 +12317,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v9, s14, v2 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 +; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 @@ -12337,8 +12337,8 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 @@ -12428,10 +12428,10 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, s14, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 @@ -12448,10 +12448,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 ; GFX9-NEXT: v_mul_hi_u32 v7, s14, v0 @@ -12465,25 +12465,25 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s14, s5, 31 ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s4, s14 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s15, s14 ; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] ; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 @@ -12520,10 +12520,10 @@ ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v8, s5 ; GFX9-NEXT: s_xor_b64 s[4:5], s[14:15], s[12:13] ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s8, s12 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: s_addc_u32 s1, s9, s12 @@ -12571,14 +12571,14 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v3, v7, s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v8, s10, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, s10, v2 ; GFX9-NEXT: v_mul_lo_u32 v10, s11, v2 ; GFX9-NEXT: v_mul_lo_u32 v11, s10, v2 -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 ; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 ; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 ; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 @@ -12586,8 +12586,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX9-NEXT: v_mul_hi_u32 v10, v4, v11 ; GFX9-NEXT: v_mul_lo_u32 v11, v4, v11 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 ; GFX9-NEXT: v_mul_hi_u32 v9, v4, v8 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v8 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 @@ -12595,14 +12595,14 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 ; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 @@ -12622,9 +12622,9 @@ ; GFX9-NEXT: v_mul_hi_u32 v5, s8, v2 ; GFX9-NEXT: v_mul_lo_u32 v7, s9, v2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v7, s7, v4 ; GFX9-NEXT: v_mov_b32_e32 v8, s9 @@ -12651,8 +12651,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -12702,20 +12702,20 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v0 @@ -12736,23 +12736,23 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 ; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s4, s14 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: s_mov_b32 s15, s14 ; GFX90A-NEXT: s_addc_u32 s1, s5, s14 -; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] ; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 @@ -12767,9 +12767,9 @@ ; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 ; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v0 -; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 ; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 ; GFX90A-NEXT: v_mov_b32_e32 v7, s13 +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 ; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v5 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] @@ -12782,17 +12782,17 @@ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 ; GFX90A-NEXT: v_cndmask_b32_e64 v3, 1, 2, s[0:1] ; GFX90A-NEXT: v_mov_b32_e32 v8, s5 -; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc ; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v0, v3 +; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc ; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 -; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 ; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] -; GFX90A-NEXT: s_add_u32 s8, s8, s4 +; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX90A-NEXT: s_add_u32 s8, s8, s4 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc ; GFX90A-NEXT: s_mov_b32 s5, s4 ; GFX90A-NEXT: s_addc_u32 s9, s9, s4 @@ -12825,10 +12825,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v2 ; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 ; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 ; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 @@ -12836,8 +12836,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 ; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 ; GFX90A-NEXT: v_mul_hi_u32 v9, s10, v2 @@ -12858,22 +12858,22 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v5, v8 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 ; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 ; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 @@ -12913,8 +12913,8 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc ; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX90A-NEXT: v_xor_b32_e32 v3, s1, v3 @@ -12947,12 +12947,12 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -12968,22 +12968,22 @@ ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 +; GFX6-NEXT: s_mov_b32 s5, s9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 @@ -13002,8 +13002,8 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s1, s11, s2 -; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 @@ -13019,12 +13019,12 @@ ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 ; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc @@ -13065,11 +13065,11 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s7, 31 @@ -13089,8 +13089,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 @@ -13103,8 +13103,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 ; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 @@ -13114,10 +13114,10 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s6, s0 -; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_addc_u32 s3, s7, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 @@ -13133,8 +13133,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, s1 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -13179,20 +13179,20 @@ ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 ; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -13200,8 +13200,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 ; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 @@ -13230,14 +13230,14 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s3, s7, s0 -; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 -; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 @@ -13247,8 +13247,8 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc ; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb -; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 ; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s3 @@ -13383,10 +13383,10 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -13404,10 +13404,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -13416,8 +13416,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 ; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -13432,10 +13432,10 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] +; GFX6-NEXT: s_addc_u32 s1, s11, s14 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 @@ -13463,12 +13463,12 @@ ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -13515,10 +13515,10 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 @@ -13528,17 +13528,17 @@ ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v5, s10, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, s10, v0 @@ -13552,25 +13552,25 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: s_add_u32 s0, s6, s10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s10, s7, 31 ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] +; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 @@ -13598,12 +13598,12 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 @@ -13653,31 +13653,31 @@ ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: s_mov_b32 s11, s10 -; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 ; GFX90A-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 ; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -13702,16 +13702,16 @@ ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 -; GFX90A-NEXT: s_addc_u32 s1, s7, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX90A-NEXT: s_addc_u32 s1, s7, s10 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 -; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 @@ -13733,12 +13733,12 @@ ; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 -; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 +; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 @@ -13910,13 +13910,13 @@ ; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s12 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 ; GFX6-NEXT: s_mov_b32 s13, s12 @@ -13937,10 +13937,10 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 ; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 @@ -13950,8 +13950,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 +; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 ; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 @@ -13995,15 +13995,15 @@ ; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 ; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GFX6-NEXT: s_ashr_i32 s2, s15, 31 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: s_add_u32 s8, s14, s2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] @@ -14015,8 +14015,8 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 ; GFX6-NEXT: v_rcp_f32_e32 v8, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc @@ -14042,33 +14042,33 @@ ; GFX6-NEXT: v_mul_lo_u32 v7, s2, v3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 +; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc +; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 ; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] ; GFX6-NEXT: v_mul_lo_u32 v8, s2, v3 ; GFX6-NEXT: v_mul_hi_u32 v9, s2, v2 ; GFX6-NEXT: v_mul_lo_u32 v10, s3, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GFX6-NEXT: v_mul_lo_u32 v9, s2, v2 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 -; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 +; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 @@ -14085,8 +14085,8 @@ ; GFX6-NEXT: s_add_u32 s0, s10, s14 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 ; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 @@ -14117,12 +14117,12 @@ ; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 -; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 @@ -14174,10 +14174,10 @@ ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 @@ -14194,10 +14194,10 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v4, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v7, s8, v0 @@ -14211,25 +14211,25 @@ ; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 ; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] ; GFX9-NEXT: s_add_u32 s2, s4, s8 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_addc_u32 s3, s5, s8 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s5, s8 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 @@ -14258,12 +14258,12 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 @@ -14273,8 +14273,8 @@ ; GFX9-NEXT: s_add_u32 s2, s10, s0 ; GFX9-NEXT: s_mov_b32 s1, s0 ; GFX9-NEXT: s_addc_u32 s3, s11, s0 -; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s11 @@ -14318,8 +14318,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 ; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 @@ -14333,8 +14333,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 ; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 ; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 ; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 @@ -14346,8 +14346,8 @@ ; GFX9-NEXT: s_add_u32 s0, s6, s12 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: s_addc_u32 s1, s7, s12 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 ; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 ; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 @@ -14380,12 +14380,12 @@ ; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s10, v7 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 +; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s10, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 @@ -14453,20 +14453,20 @@ ; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 -; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 ; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -14491,16 +14491,16 @@ ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s4, s14 -; GFX90A-NEXT: s_addc_u32 s1, s5, s14 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] +; GFX90A-NEXT: s_addc_u32 s1, s5, s14 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] ; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 ; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 -; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 -; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 @@ -14522,12 +14522,12 @@ ; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v0 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 -; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 -; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 +; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 +; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 ; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GFX90A-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 @@ -14536,12 +14536,12 @@ ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX90A-NEXT: s_add_u32 s2, s10, s0 +; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 ; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 +; GFX90A-NEXT: s_add_u32 s2, s10, s0 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX90A-NEXT: s_mov_b32 s1, s0 ; GFX90A-NEXT: s_addc_u32 s3, s11, s0 @@ -14553,10 +14553,9 @@ ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 ; GFX90A-NEXT: s_sub_u32 s2, 0, s4 -; GFX90A-NEXT: s_subb_u32 s3, 0, s5 +; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 ; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 ; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 -; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, s14 ; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 ; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 @@ -14565,8 +14564,8 @@ ; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX90A-NEXT: s_subb_u32 s3, 0, s5 ; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v2 ; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v3 ; GFX90A-NEXT: v_mul_lo_u32 v5, s3, v2 @@ -14575,10 +14574,10 @@ ; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v2 ; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 ; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 ; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 ; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 ; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 @@ -14586,8 +14585,8 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc +; GFX90A-NEXT: v_add_co_u32_e64 v2, s[0:1], v2, v5 ; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v3, v7, s[0:1] ; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 ; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v2 @@ -14610,19 +14609,20 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v6, v9, vcc ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 ; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v3, v8, s[0:1] ; GFX90A-NEXT: s_add_u32 s0, s6, s10 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX90A-NEXT: s_mov_b32 s11, s10 ; GFX90A-NEXT: s_addc_u32 s1, s7, s10 -; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] ; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 -; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 -; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 +; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 ; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 ; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 @@ -14644,12 +14644,12 @@ ; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v2 ; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 -; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v7 -; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 +; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] ; GFX90A-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] ; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v8 +; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 ; GFX90A-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] ; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -5,8 +5,8 @@ ; GCN-LABEL: test_mul24_knownbits_kernel: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_and_b32_e32 v0, 3, v0 -; GCN-NEXT: v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_mul_i32_i24_e32 v0, -5, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffffffe0, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -65,11 +65,11 @@ ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GCN-NEXT: s_mov_b32 s14, 0 ; GCN-NEXT: s_mov_b32 s15, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; GCN-NEXT: s_mov_b64 s[2:3], s[14:15] @@ -94,10 +94,10 @@ ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -38,9 +38,9 @@ ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]] ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]] ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -822,9 +822,9 @@ ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_mul_i32 s7, s1, s6 ; GFX8-NEXT: s_mul_i32 s6, s0, s6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc @@ -877,15 +877,15 @@ ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc @@ -1755,8 +1755,8 @@ ; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 @@ -1794,8 +1794,8 @@ ; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 @@ -1841,9 +1841,9 @@ ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1881,9 +1881,9 @@ ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1959,9 +1959,9 @@ ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_mul_i32 s7, s1, s6 ; GFX8-NEXT: s_mul_i32 s6, s0, s6 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc @@ -2014,15 +2014,15 @@ ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -979,8 +979,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1013,14 +1013,14 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1958,8 +1958,8 @@ ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 @@ -1991,8 +1991,8 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 @@ -2030,9 +2030,9 @@ ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -2064,9 +2064,9 @@ ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -2158,8 +2158,8 @@ ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -2192,14 +2192,14 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -4189,8 +4189,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -4222,8 +4222,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -4594,8 +4594,8 @@ ; GFX8-NEXT: BB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] @@ -4627,8 +4627,8 @@ ; GFX9-NEXT: BB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -311,8 +311,8 @@ ; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 ; GFX1064-NEXT: v_mov_b32_e32 v2, s12 ; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s12, v1, 15 ; GFX1064-NEXT: v_readlane_b32 s13, v1, 31 ; GFX1064-NEXT: v_writelane_b32 v3, s12, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -307,8 +307,8 @@ ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-inst-size-gfx10.ll @@ -6,6 +6,7 @@ ; GCN-LABEL: {{^}}long_forward_branch_gfx10only: ; GFX9: s_cmp_eq_u32 +; GFX9: s_load_dwordx2 ; GFX9-NEXT: s_cbranch_scc1 ; GFX10: s_cmp_eq_u32 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -19,25 +19,25 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5 ; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5 ; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9 ; GFX9-NEXT: v_mul_lo_u32 v11, v5, v9 @@ -53,12 +53,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 ; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 ; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 @@ -67,8 +67,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 ; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 @@ -113,9 +113,9 @@ ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] ; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 2, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v6, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 1, v5 @@ -156,8 +156,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 @@ -188,18 +188,18 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4 ; GFX9-NEXT: v_add3_u32 v8, v10, v8, v9 @@ -216,12 +216,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 ; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 @@ -271,9 +271,9 @@ ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v4 @@ -284,11 +284,11 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: BB1_2: ; %Flow @@ -309,8 +309,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 @@ -346,19 +346,19 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_lo_u32 v10, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4 ; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4 -; GFX9-NEXT: v_mul_lo_u32 v10, v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4 ; GFX9-NEXT: v_add3_u32 v8, v9, v10, v8 ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v8 @@ -374,12 +374,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v13, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 ; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 @@ -388,8 +388,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v16, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v12 ; GFX9-NEXT: v_mul_hi_u32 v10, v8, v7 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v14, v16, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6 @@ -420,8 +420,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v3, v4 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v8, v5, v7 ; GFX9-NEXT: v_sub_u32_e32 v7, v1, v5 @@ -430,13 +430,13 @@ ; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v0, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v4, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] ; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -444,12 +444,12 @@ ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6 @@ -505,18 +505,18 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4 ; GFX9-NEXT: v_add3_u32 v8, v10, v8, v9 @@ -533,12 +533,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 ; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 @@ -574,8 +574,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 ; GFX9-NEXT: v_add3_u32 v5, v7, v5, v6 ; GFX9-NEXT: v_sub_u32_e32 v6, v1, v5 @@ -584,13 +584,13 @@ ; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v0, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] ; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v6, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -598,10 +598,10 @@ ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v9, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc @@ -780,25 +780,25 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 ; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5 ; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5 -; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6 ; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5 ; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9 ; GFX9-NEXT: v_mul_lo_u32 v11, v5, v9 @@ -814,12 +814,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc +; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9 ; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9 -; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5 +; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v10 ; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8 @@ -828,8 +828,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8 ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 ; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8 +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7 @@ -877,8 +877,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[6:7] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v6, s[6:7] -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v6, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v12 @@ -889,20 +889,20 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc ; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v9, v3, s[4:5] -; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v10, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v15, v13, s[6:7] -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v10, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v8, v7, v4 +; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8 ; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7 ; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v6, v8, s[8:9] ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 @@ -929,13 +929,13 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GFX9-NEXT: BB8_4: @@ -969,18 +969,18 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v8, v6, v5 +; GFX9-NEXT: v_mul_lo_u32 v9, v7, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4 ; GFX9-NEXT: v_add3_u32 v8, v10, v8, v9 @@ -997,12 +997,12 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v10, vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8 ; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8 -; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4 ; GFX9-NEXT: v_add_u32_e32 v5, v5, v9 ; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7 @@ -1055,8 +1055,8 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] ; GFX9-NEXT: v_add_co_u32_e64 v11, s[6:7], 2, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7] -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 1, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10 @@ -1074,9 +1074,9 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: BB9_2: ; %Flow @@ -1098,13 +1098,13 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc ; GFX9-NEXT: BB9_4: diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -9,7 +9,6 @@ ; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s[34:35] ; GCN-NEXT: #ASMSTART @@ -96,10 +95,10 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: -; GCN: s_getpc_b64 +; GCN: s_mov_b64 s[34:35], vcc +; GCN-NEXT: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b64 s[34:35], vcc ; GCN-NEXT: s_swappc_b64 ; GCN: s_mov_b64 vcc, s[34:35] define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 { @@ -113,7 +112,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: ; GCN: s_mov_b32 s33, s31 -; GCN-NEXT: s_swappc_b64 +; GCN: s_swappc_b64 ; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { %s31 = call i32 asm sideeffect "; def $0", "={s31}"() @@ -124,7 +123,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: ; GCN: v_mov_b32_e32 v40, v31 -; GCN-NEXT: s_swappc_b64 +; GCN: s_swappc_b64 ; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() @@ -136,18 +135,17 @@ ; FIXME: What is the expected behavior for reserved registers here? ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND @@ -163,18 +161,18 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; def s34 ; GCN-NEXT: ;;#ASMEND +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GCN-NOT: s34 ; MUBUF: s_swappc_b64 s[30:31], s[4:5] @@ -196,18 +194,18 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v40 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND +; MUBUF: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; FLATSCR: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; MUBUF: s_swappc_b64 s[30:31], s[4:5] ; FLATSCR: s_swappc_b64 s[30:31], s[0:1] @@ -250,10 +248,10 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: +; GCN: s_mov_b32 s32, 0 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { @@ -262,10 +260,10 @@ } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: +; GCN: s_mov_b32 s32, 0 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -13,10 +13,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr @@ -37,10 +37,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr @@ -58,10 +58,10 @@ ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: global_store_dword v40, v40, s[34:35] @@ -80,10 +80,10 @@ ; GCN-NEXT: s_add_u32 s0, s0, s9 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: global_store_dword v40, v0, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -276,9 +276,10 @@ ; GCN-NEXT: v_writelane_b32 v0, s33, 63 ; GCN-COUNT-60: v_writelane_b32 v0 ; GCN: s_mov_b32 s33, s32 -; GCN-COUNT-2: v_writelane_b32 v0 +; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GCN: v_writelane_b32 v0 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART @@ -318,12 +319,14 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-62: v_writelane_b32 v0, -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: v_writelane_b32 v0, +; GCN-COUNT-61: v_writelane_b32 v0, +; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GCN: v_writelane_b32 v0, ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART @@ -389,8 +392,8 @@ ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 v0, s33, 2 -; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v0, s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 @@ -435,8 +438,8 @@ ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; MUBUF-DAG: buffer_store_dword @@ -672,8 +675,8 @@ ; GCN-NOT: v_mov_b32_e32 v0, 0x100c ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40300 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill -; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004 ; FLATSCR: v_mov_b32_e32 v0, 0 +; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1004 ; FLATSCR: scratch_store_dword off, v0, [[SOFF]] define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -234,8 +234,8 @@ } ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input: -; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s13, s15 +; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s14, s16 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -196,10 +196,10 @@ ; GCN-NOT: s6 ; GCN: s_mov_b32 s4, s6 -; GCN-NEXT: s_getpc_b64 s[6:7] +; GCN: s_mov_b32 s32, 0 +; GCN: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { @@ -254,8 +254,8 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s4, s6 ; GCN: s_mov_b32 s5, s7 +; GCN: s_mov_b32 s4, s6 ; GCN: s_mov_b32 s6, s8 ; GCN: s_mov_b32 s32, 0 @@ -285,8 +285,8 @@ ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s5, s8 +; GCN: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -449,8 +449,8 @@ ; GCN: enable_sgpr_dispatch_id = 1 ; GCN: enable_sgpr_flat_scratch_init = 1 -; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s13, s15 +; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s14, s16 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -526,18 +526,14 @@ ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5] ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7] - ; GCN: s_mov_b32 s4, s12 ; GCN: s_mov_b32 s5, s13 ; GCN: s_mov_b32 s6, s14 -; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9] - -; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12 -; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s14 - - +; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13 +; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12 +; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9] ; GCN: s_swappc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -467,8 +467,8 @@ ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 ; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] -; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; FIXEDABI: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { @@ -623,8 +623,8 @@ ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI: s_movk_i32 s32, 0x400{{$}} +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} @@ -775,14 +775,14 @@ ; GCN-DAG: s_mov_b32 s32, 0 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 +; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 ; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]] ; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]] ; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}} -; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] -; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140 ; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}} +; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]] ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -14,8 +14,8 @@ ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] -; GCN: buffer_store_dword v{{[0-9]+}}, off, ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -59,11 +59,11 @@ ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: s_getpc_b64 s[4:5] ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -73,10 +73,10 @@ ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_getpc_b64 s[4:5] ; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; @@ -107,13 +107,13 @@ ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_getpc_b64 s[4:5] ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 -; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -124,12 +124,12 @@ ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; @@ -219,12 +219,12 @@ ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; GFX803-NEXT: s_add_u32 s0, s0, s7 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 s32, 0 +; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 ; GFX803-NEXT: s_getpc_b64 s[4:5] ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -234,11 +234,11 @@ ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 ; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_getpc_b64 s[4:5] ; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; @@ -271,13 +271,13 @@ ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_getpc_b64 s[4:5] ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 -; GFX803-NEXT: s_movk_i32 s32, 0x400 -; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 -; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX803-NEXT: s_endpgm ; @@ -286,15 +286,15 @@ ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s7 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b32 s33, 0 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_getpc_b64 s[4:5] -; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -59,18 +59,18 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] +; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 @@ -78,11 +78,11 @@ ; GFX10-NEXT: s_add_u32 s0, s4, 16 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: s_add_u32 s2, s4, 24 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 -; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v7, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: flat_store_dword v[0:1], v8 @@ -169,26 +169,26 @@ ; GFX10-NEXT: s_add_u32 s0, s2, 24 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s6 +; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) @@ -328,14 +328,14 @@ ; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v8 ; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v9 -; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, v4 ; GFX9-NEXT: v_mov_b32_e32 v7, v4 ; GFX9-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX9-NEXT: v_add_f32_e32 v9, 2.0, v9 +; GFX9-NEXT: v_mov_b32_e32 v10, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v11, v10 ; GFX9-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-NEXT: v_mov_b32_e32 v13, v10 @@ -351,13 +351,13 @@ ; ; GFX10-LABEL: cluster_image_sample: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v8, v0 ; GFX10-NEXT: v_cvt_f32_i32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 1.0 -; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: v_add_f32_e32 v2, 1.0, v8 ; GFX10-NEXT: v_add_f32_e32 v3, 1.0, v9 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v7, v4 ; GFX10-NEXT: v_add_f32_e32 v8, 2.0, v8 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -258,12 +258,12 @@ ; SI-NEXT: s_mov_b32 s0, s6 ; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s12, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_and_b32_e32 v2, s12, v0 +; SI-NEXT: v_and_b32_e32 v4, s12, v1 ; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v1, s13, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 @@ -303,9 +303,9 @@ ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s13, v1 -; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s12, v0 ; VI-NEXT: v_and_b32_e32 v3, s13, v3 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s14, v1 ; VI-NEXT: v_add_u16_e32 v2, s14, v2 @@ -347,12 +347,13 @@ ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_and_b32_e32 v2, s16, v0 +; SI-NEXT: v_and_b32_e32 v4, s16, v1 ; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v1, s17, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 @@ -361,7 +362,6 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -389,20 +389,20 @@ ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v4, s16, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s17, v1 -; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s16, v0 ; VI-NEXT: v_and_b32_e32 v3, s17, v3 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s18, v1 ; VI-NEXT: v_add_u16_e32 v2, s18, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,13 +31,13 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 @@ -65,13 +65,13 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 @@ -99,13 +99,13 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 @@ -133,13 +133,13 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v40, s33, 2 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_mov_b32_e32 v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -409,14 +409,14 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, s0, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, s0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -431,8 +431,8 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b32 s0, 0xffff @@ -444,8 +444,8 @@ ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 @@ -534,24 +534,24 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, s0, v0 -; SI-NEXT: v_and_b32_e32 v5, s0, v1 -; SI-NEXT: v_and_b32_e32 v6, s0, v2 -; SI-NEXT: v_and_b32_e32 v7, s0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v5, s0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v6, s0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, s0, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 ; SI-NEXT: v_or_b32_e32 v2, v6, v2 @@ -568,8 +568,8 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_mov_b32 s0, 0xffff @@ -587,12 +587,12 @@ ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 ; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 -; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 -; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; VI-NEXT: v_or_b32_e32 v3, v3, v4 ; VI-NEXT: v_or_b32_e32 v2, v2, v5 @@ -718,19 +718,19 @@ ; SI-NEXT: v_and_b32_e32 v12, s0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, s0, v4 -; SI-NEXT: v_and_b32_e32 v9, s0, v5 -; SI-NEXT: v_and_b32_e32 v10, s0, v6 -; SI-NEXT: v_and_b32_e32 v11, s0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v9, s0, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v10, s0, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v11, s0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, s0, v1 -; SI-NEXT: v_and_b32_e32 v14, s0, v2 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v13, s0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v14, s0, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 @@ -741,20 +741,20 @@ ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_bcnt_u32_b32_e64 v8, v8, 0 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 ; SI-NEXT: v_or_b32_e32 v6, v10, v6 @@ -777,8 +777,8 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc @@ -790,15 +790,15 @@ ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 ; VI-NEXT: v_and_b32_e32 v3, v8, v3 ; VI-NEXT: v_and_b32_e32 v2, v8, v2 ; VI-NEXT: v_and_b32_e32 v1, v8, v1 ; VI-NEXT: v_and_b32_e32 v0, v8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 ; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 @@ -807,25 +807,25 @@ ; VI-NEXT: v_and_b32_e32 v6, s0, v6 ; VI-NEXT: v_and_b32_e32 v5, s0, v5 ; VI-NEXT: v_and_b32_e32 v4, s0, v4 +; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 +; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 +; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 ; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0 ; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0 ; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0 -; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 -; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 -; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 -; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; VI-NEXT: v_or_b32_e32 v3, v3, v9 ; VI-NEXT: v_or_b32_e32 v2, v2, v10 @@ -1186,8 +1186,8 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_movk_i32 s0, 0x3e7 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1063,9 +1063,9 @@ ; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_bfe_u32 v3, v3, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-GISEL-NEXT: v_lshl_or_b32 v2, v2, 16, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -87,9 +87,9 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -98,9 +98,9 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -933,8 +933,8 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s0, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 -; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: v_or_b32_e32 v0, v7, v0 ; SI-NEXT: v_and_b32_e32 v2, s0, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -955,8 +955,8 @@ ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b32 s6, s10 @@ -968,10 +968,11 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; VI-NEXT: v_add_u16_e32 v8, 9, v4 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1056,10 +1057,10 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 @@ -1104,11 +1105,11 @@ ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -26,13 +26,11 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c -; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 -; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 +; GCN-NEXT: v_sub_f32_e64 v5, s24, s28 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_clause 0x4 ; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60 @@ -40,12 +38,14 @@ ; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0 ; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70 ; GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10 +; GCN-NEXT: v_fma_f32 v1, v1, v5, s28 ; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp +; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0 ; GCN-NEXT: v_sub_f32_e32 v8, s0, v1 -; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a ; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6 -; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 ; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0 +; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a +; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v9, s10, v0 ; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14 diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll --- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll @@ -373,9 +373,10 @@ ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) +; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) ; ALIGNED-GISEL-NEXT: v_and_b32_e32 v4, v8, v1 -; ALIGNED-GISEL-NEXT: v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; ALIGNED-GISEL-NEXT: v_and_b32_e32 v1, v9, v1 ; ALIGNED-GISEL-NEXT: v_and_or_b32 v3, v6, s2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -48,11 +48,11 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-NEXT: s_mov_b64 vcc, 0 -; CI-NEXT: v_mov_b32_e32 v2, 0x7b ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 ; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x7b ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 @@ -83,8 +83,8 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -295,11 +295,11 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0 ; CI-NEXT: s_mov_b64 vcc, 0 -; CI-NEXT: v_mov_b32_e32 v2, 0x7b ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 ; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1 +; CI-NEXT: v_mov_b32_e32 v2, 0x7b ; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_mov_b32 s3, 0xf000 @@ -332,9 +332,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -548,10 +548,10 @@ ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v1, v8, v1 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -642,10 +642,10 @@ ; CI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; CI-NEXT: s_waitcnt lgkmcnt(2) ; CI-NEXT: v_or_b32_e32 v6, v6, v7 -; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; CI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_or_b32_e32 v1, v8, v1 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -1126,8 +1126,8 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v7 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -183,9 +183,9 @@ ; CI-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc @@ -230,9 +230,9 @@ ; CI-LABEL: simple_write2_two_val_subreg2_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 @@ -269,9 +269,9 @@ ; CI-LABEL: simple_write2_two_val_subreg4_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64 @@ -653,8 +653,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 +; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; CI-NEXT: ds_write_b8 v0, v2 offset:13 @@ -682,9 +682,9 @@ ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v0 offset:5 +; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v1 offset:15 ; GFX9-ALIGNED-NEXT: ds_write_b8 v2, v1 offset:13 @@ -898,8 +898,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_lshl_b32 s2, s2, 2 ; GFX9-NEXT: s_add_i32 s3, s2, 0xc20 -; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_addk_i32 s2, 0xc60 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -955,13 +955,13 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s1 -; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; CI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -14,8 +14,8 @@ ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_cmp_lg_u32 s4, 0 ; GFX7-NEXT: s_addc_u32 s4, s6, 0 -; GFX7-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX7-NEXT: s_cmp_gt_u32 s6, 31 ; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 @@ -31,8 +31,8 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6 ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_addc_u32 s4, s6, 0 -; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_cselect_b64 vcc, 1, 0 ; GFX9-NEXT: s_cmp_gt_u32 s6, 31 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -69,8 +69,8 @@ ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 @@ -85,8 +85,8 @@ ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-UNALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-UNALIGNED-NEXT: s_addc_u32 s3, s1, 0 +; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s3 @@ -192,15 +192,15 @@ ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 +; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -237,8 +237,8 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -723,9 +723,9 @@ ; GFX9-NEXT: s_setpc_b64 ; VI: s_waitcnt -; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_max_f16_e32 v1, v1, v1 ; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e32 v0, v0, v0 ; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -163,15 +163,15 @@ ; VI-NEXT: s_movk_i32 s4, 0x3dc5 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mul_f16_e32 v2, s4, v1 -; VI-NEXT: v_mul_f16_e32 v4, s4, v0 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v4, s4, v0 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v2, v2 -; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v4, v4 ; VI-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_exp_v4f16: @@ -179,15 +179,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x3dc5 ; GFX9-NEXT: v_mul_f16_e32 v2, s4, v1 -; GFX9-NEXT: v_mul_f16_e32 v3, s4, v0 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mul_f16_e32 v3, s4, v0 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_exp_f16_e32 v2, v2 -; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_exp_f16_e32 v3, v3 ; GFX9-NEXT: v_exp_f16_e32 v0, v0 -; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 +; GFX9-NEXT: v_exp_f16_e32 v1, v1 ; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0) ret <4 x half> %result diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -171,14 +171,14 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, vcc_lo ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, vcc_lo ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_mov_b32 s2, exec_lo diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -227,8 +227,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s0, 4 @@ -269,8 +269,8 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 @@ -338,8 +338,8 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_add_i32 s1, s1, 4 @@ -362,8 +362,8 @@ ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 @@ -382,8 +382,8 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 @@ -412,8 +412,8 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 15 ; GFX9-NEXT: scratch_store_dword v2, v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -855,9 +855,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_addk_i32 s1, 0x104 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_addk_i32 s0, 0x104 @@ -898,13 +898,13 @@ ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 -; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 @@ -988,9 +988,9 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: s_lshl_b32 s0, s2, 2 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 ; GFX9-NEXT: s_addk_i32 s0, 0x104 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 @@ -1010,8 +1010,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_addk_i32 s1, 0x104 @@ -1032,13 +1032,13 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 -; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 @@ -1085,8 +1085,8 @@ ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 @@ -1627,9 +1627,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 -; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_addk_i32 s1, 0x4004 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_addk_i32 s0, 0x4004 @@ -1670,13 +1670,13 @@ ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 -; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 @@ -1760,9 +1760,9 @@ ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: s_lshl_b32 s0, s2, 2 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 ; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 @@ -1782,8 +1782,8 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_addk_i32 s1, 0x4004 @@ -1804,13 +1804,13 @@ ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 -; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 @@ -1857,8 +1857,8 @@ ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 +; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 @@ -2102,9 +2102,9 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s0, 4 @@ -2216,8 +2216,8 @@ ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: scratch_store_dword off, v0, s32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_add_i32 s0, s0, s32 @@ -2247,8 +2247,8 @@ ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_add_i32 s0, s0, s32 @@ -2330,8 +2330,8 @@ ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2722,8 +2722,8 @@ define amdgpu_ps void @large_offset() { ; GFX9-LABEL: large_offset: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -231,9 +231,9 @@ ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc @@ -260,9 +260,9 @@ ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc @@ -280,8 +280,8 @@ ; VI-NNAN: ; %bb.0: ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 ; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 @@ -347,17 +347,17 @@ ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc @@ -394,17 +394,17 @@ ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc @@ -430,12 +430,12 @@ ; VI-NNAN: ; %bb.0: ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v3, v3, v7 ; VI-NNAN-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v2, v2, v6 ; VI-NNAN-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NNAN-NEXT: v_max_f16_e32 v3, v3, v7 +; VI-NNAN-NEXT: v_max_f16_e32 v2, v2, v6 +; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 ; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 ; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -232,9 +232,9 @@ ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc @@ -261,9 +261,9 @@ ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc @@ -281,8 +281,8 @@ ; VI-NNAN: ; %bb.0: ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 ; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 ; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 @@ -348,17 +348,17 @@ ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc @@ -395,17 +395,17 @@ ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc @@ -431,12 +431,12 @@ ; VI-NNAN: ; %bb.0: ; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NNAN-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v3, v3, v7 ; VI-NNAN-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v2, v2, v6 ; VI-NNAN-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NNAN-NEXT: v_min_f16_e32 v3, v3, v7 +; VI-NNAN-NEXT: v_min_f16_e32 v2, v2, v6 +; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 ; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 ; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 ; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -131,8 +131,8 @@ ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16: ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff -; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]] ; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000 +; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]] ; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]] ; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}} ; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll @@ -382,15 +382,15 @@ ; ; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c ; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b32 v1, v0 @@ -906,17 +906,16 @@ ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -967,17 +966,16 @@ ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_clause 0x2 +; G_GFX10-NEXT: s_clause 0x1 ; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX10-NEXT: s_nop 0 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX10-NEXT: s_waitcnt vmcnt(0) +; G_GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; G_GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; G_GFX10-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -54,8 +54,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -104,8 +104,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 4 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -154,8 +154,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -204,8 +204,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 4 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -254,8 +254,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -304,8 +304,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 4 offen glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] @@ -354,8 +354,8 @@ ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -338,10 +338,10 @@ ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s3, |v0| ; VI-NEXT: v_trunc_f32_e32 v4, s0 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s2 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_floor_f32_e32 v3, v3 +; VI-NEXT: v_mul_f32_e64 v3, |v4|, s2 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 +; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 ; VI-NEXT: v_fma_f32 v3, v3, s3, |v4| ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -517,10 +517,10 @@ ; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 ; VI-NEXT: v_floor_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v2, v1, s9, |v0| -; VI-NEXT: v_trunc_f32_e32 v4, s0 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 +; VI-NEXT: v_trunc_f32_e32 v4, s0 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 +; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 ; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 @@ -549,8 +549,8 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v9, v6 ; VI-NEXT: v_fma_f32 v6, v6, s9, |v8| ; VI-NEXT: v_cvt_u32_f32_e32 v10, v6 -; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4 ; VI-NEXT: v_xor_b32_e32 v7, v7, v4 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4 ; VI-NEXT: v_ashrrev_i32_e32 v5, 31, v8 ; VI-NEXT: v_subb_u32_e32 v7, vcc, v7, v4, vcc ; VI-NEXT: v_xor_b32_e32 v4, v10, v5 diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -56,8 +56,8 @@ ; GCN: buffer_load_dword ; SI-DAG: v_lshrrev_b32_e32 ; SI-DAG: v_cvt_f32_f16_e32 -; GFX89: v_cvt_f32_f16_sdwa ; GCN: v_cvt_f32_f16_e32 +; GFX89: v_cvt_f32_f16_sdwa ; GCN: v_cvt_f64_f32_e32 ; GCN: v_cvt_f64_f32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -171,10 +171,10 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -189,10 +189,10 @@ ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -253,10 +253,10 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -271,10 +271,10 @@ ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -282,19 +282,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -;GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -;GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0 -;GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -;GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1 -;GFX10-NEXT: v_log_f32_e32 v2, v2 -;GFX10-NEXT: v_log_f32_e32 v0, v0 -;GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 -;GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -;GFX10-NEXT: v_exp_f32_e32 v1, v2 -;GFX10-NEXT: v_exp_f32_e32 v0, v0 -;GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -;GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -;GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -336,10 +336,10 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -354,10 +354,10 @@ ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -365,19 +365,19 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -;GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -;GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 -;GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -;GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1 -;GFX10-NEXT: v_log_f32_e32 v2, v2 -;GFX10-NEXT: v_log_f32_e32 v0, v0 -;GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 -;GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -;GFX10-NEXT: v_exp_f32_e32 v1, v2 -;GFX10-NEXT: v_exp_f32_e32 v0, v0 -;GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 -;GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -;GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX10-NEXT: v_log_f32_e32 v2, v2 +; GFX10-NEXT: v_log_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_exp_f32_e32 v1, v2 +; GFX10-NEXT: v_exp_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -397,9 +397,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0x80008000 ; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 @@ -424,10 +424,10 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f32_e32 v2, v2 -; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_exp_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -442,10 +442,10 @@ ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -114,12 +114,13 @@ ; SI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] ; SI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; SI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] -; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] -; VI: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] -; VI: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] -; VI: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] -; VI: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] +; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] +; VI-DAG: v_cvt_i32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] +; VI-DAG: v_cvt_i32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] +; VI-NOT: DEADBEEF +; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_1_High:[0-9]+]], 31, v[[R_I64_1_Low]] +; VI-DAG: v_ashrrev_i32_e32 v[[R_I64_0_High:[0-9]+]], 31, v[[R_I64_0_Low]] ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @fptosi_v2f16_to_v2i64( diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -112,10 +112,10 @@ ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] ; SI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] -; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] -; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] +; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] +; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -63,13 +63,12 @@ ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}} ; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} +; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; ; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]] -; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD - ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -14,16 +14,16 @@ ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s33, 2 -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] -; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) +; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] +; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 0 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s5, v40, 1 @@ -52,11 +52,11 @@ ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[6:7], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -59,9 +59,9 @@ ; CI-NEXT: s_mov_b32 s9, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s3, s11 ; CI-NEXT: s_mov_b32 s6, s10 ; CI-NEXT: s_mov_b32 s7, s11 +; CI-NEXT: s_mov_b32 s3, s11 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -1393,9 +1393,9 @@ ; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 ; CI-NEXT: s_mov_b32 s6, 3 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -171,8 +171,8 @@ ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_not_b32 s1, s1 ; SI-NEXT: v_alignbit_b32 v0, s3, v0, 1 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: s_lshr_b32 s3, s3, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_not_b32 s0, s0 @@ -192,9 +192,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s7, s5, 1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_not_b32 s0, s0 @@ -218,8 +218,8 @@ ; GFX9-NEXT: s_lshr_b32 s0, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-NEXT: s_not_b32 s1, s9 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_not_b32 s1, s8 @@ -363,20 +363,20 @@ ; SI-NEXT: v_mov_b32_e32 v0, s15 ; SI-NEXT: s_not_b32 s3, s3 ; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1 -; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_lshr_b32 s11, s11, 1 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s14 ; SI-NEXT: s_not_b32 s2, s2 -; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1 ; SI-NEXT: s_lshr_b32 s3, s10, 1 +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_alignbit_b32 v2, s3, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: s_not_b32 s1, s1 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1 ; SI-NEXT: s_lshr_b32 s2, s9, 1 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_not_b32 s0, s0 @@ -396,21 +396,21 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s11, s7, 1 ; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v3, s11, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: s_not_b32 s2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 ; VI-NEXT: s_lshr_b32 s3, s6, 1 +; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: s_not_b32 s1, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; VI-NEXT: s_lshr_b32 s2, s5, 1 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: s_not_b32 s0, s0 @@ -434,20 +434,20 @@ ; GFX9-NEXT: s_lshr_b32 s0, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s11 ; GFX9-NEXT: s_not_b32 s1, s15 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: s_not_b32 s1, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1 ; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: s_not_b32 s1, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1 ; GFX9-NEXT: s_lshr_b32 s0, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_not_b32 s1, s12 @@ -494,10 +494,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s7, 1 -; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 -; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 ; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1 ; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1 +; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1 +; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1 ; GFX10-NEXT: s_not_b32 s1, s15 ; GFX10-NEXT: s_lshr_b32 s6, s6, 1 ; GFX10-NEXT: s_not_b32 s7, s14 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -640,9 +640,9 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_xor_b32_e32 v3, -1, v2 -; VI-NEXT: v_and_b32_e32 v2, 15, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; VI-NEXT: v_and_b32_e32 v3, 15, v3 +; VI-NEXT: v_and_b32_e32 v2, 15, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -652,9 +652,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -670,8 +670,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 @@ -710,9 +710,9 @@ ; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_xor_b32_e32 v4, -1, v2 -; VI-NEXT: v_and_b32_e32 v2, 15, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; VI-NEXT: v_and_b32_e32 v4, 15, v4 +; VI-NEXT: v_and_b32_e32 v2, 15, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -724,9 +724,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -767,9 +767,9 @@ ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_or_b32_e32 v3, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v2, s4, v3 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -788,9 +788,9 @@ ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_xor_b32_e32 v7, -1, v5 -; VI-NEXT: v_and_b32_e32 v5, 15, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_and_b32_e32 v7, 15, v7 +; VI-NEXT: v_and_b32_e32 v5, 15, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -817,9 +817,9 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 @@ -843,30 +843,30 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 ; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX10-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -893,9 +893,9 @@ ; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, s4, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 @@ -906,8 +906,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; VI-NEXT: v_and_b32_e32 v7, 15, v6 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 ; VI-NEXT: v_mov_b32_e32 v8, 1 +; VI-NEXT: v_xor_b32_e32 v6, -1, v6 ; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_and_b32_e32 v6, 15, v6 ; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -920,9 +920,9 @@ ; VI-NEXT: v_and_b32_e32 v7, 15, v7 ; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5 -; VI-NEXT: v_and_b32_e32 v5, 15, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_and_b32_e32 v8, 15, v8 +; VI-NEXT: v_and_b32_e32 v5, 15, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -944,8 +944,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -958,9 +958,9 @@ ; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 @@ -973,9 +973,9 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -991,8 +991,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6 ; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 @@ -1038,8 +1038,8 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 63, v4 -; SI-NEXT: v_not_b32_e32 v4, v4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SI-NEXT: v_not_b32_e32 v4, v4 ; SI-NEXT: v_and_b32_e32 v4, 63, v4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 @@ -1051,8 +1051,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v5, 63, v4 -; VI-NEXT: v_not_b32_e32 v4, v4 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; VI-NEXT: v_not_b32_e32 v4, v4 ; VI-NEXT: v_and_b32_e32 v4, 63, v4 ; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -1064,8 +1064,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] @@ -1121,8 +1121,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v9, 63, v8 -; VI-NEXT: v_not_b32_e32 v8, v8 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; VI-NEXT: v_not_b32_e32 v8, v8 ; VI-NEXT: v_and_b32_e32 v8, 63, v8 ; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] ; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] @@ -1142,8 +1142,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -100,14 +100,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 @@ -193,13 +193,13 @@ ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -291,13 +291,13 @@ ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -387,14 +387,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -477,13 +477,13 @@ ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -569,13 +569,13 @@ ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -659,14 +659,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -749,13 +749,13 @@ ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -841,13 +841,13 @@ ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -931,14 +931,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1019,15 +1019,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1113,13 +1113,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1205,17 +1205,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1305,15 +1305,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1408,17 +1408,17 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1513,14 +1513,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1601,14 +1601,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1689,15 +1689,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1780,16 +1780,16 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1874,18 +1874,18 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -1974,15 +1974,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2065,17 +2065,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2162,19 +2162,19 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2204,10 +2204,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -2238,10 +2238,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -2266,13 +2266,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2355,13 +2355,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2444,13 +2444,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2532,15 +2532,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2623,15 +2623,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2715,13 +2715,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2803,15 +2803,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2895,13 +2895,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -2984,13 +2984,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3072,15 +3072,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3163,16 +3163,16 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3257,17 +3257,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3355,13 +3355,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3443,17 +3443,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3540,18 +3540,18 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3642,16 +3642,16 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3676,16 +3676,15 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -3711,16 +3710,15 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -3746,9 +3744,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 @@ -3757,10 +3755,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3792,10 +3790,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -3828,10 +3826,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -3857,18 +3855,18 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -3893,18 +3891,17 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -3930,18 +3927,17 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -3969,8 +3965,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 @@ -3981,11 +3977,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -4010,8 +4006,6 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] @@ -4022,10 +4016,11 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -4051,8 +4046,6 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] @@ -4063,10 +4056,11 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4094,8 +4088,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 @@ -4106,11 +4100,11 @@ ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4139,8 +4133,6 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -4152,10 +4144,11 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -4183,8 +4176,6 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -4196,10 +4187,11 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -4229,17 +4221,17 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: global_store_dword v[41:42], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4265,16 +4257,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: global_store_dword v[41:42], v0, off @@ -4303,16 +4295,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off @@ -4345,16 +4337,16 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 ; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -4379,16 +4371,15 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[4:5] ; GFX10-NEXT: global_load_dword v1, v2, s[4:5] offset:4 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -4414,16 +4405,15 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -4453,14 +4443,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -4560,14 +4550,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 @@ -4698,11 +4688,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 @@ -4714,14 +4704,14 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_mov_b32_e32 v12, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v16 ; GFX9-NEXT: v_mov_b32_e32 v2, v17 @@ -4765,14 +4755,14 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v12, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, v16 ; GFX10-NEXT: v_mov_b32_e32 v2, v17 @@ -4817,14 +4807,14 @@ ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v9, 8, v2 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v13, 8, v3 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v14, 16, v3 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v12, v3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, v16 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, v17 @@ -4903,14 +4893,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 @@ -4994,14 +4984,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5082,14 +5072,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5170,14 +5160,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 42 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5258,15 +5248,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5351,13 +5341,13 @@ ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5441,17 +5431,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5540,15 +5530,15 @@ ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[10:11] ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5640,17 +5630,17 @@ ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s8, 1 ; GFX9-NEXT: s_mov_b32 s9, 2 ; GFX9-NEXT: s_mov_b32 s10, 3 ; GFX9-NEXT: s_mov_b32 s11, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[12:13] ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5743,14 +5733,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5831,14 +5821,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 4.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -5919,15 +5909,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6010,16 +6000,16 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6104,18 +6094,18 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_mov_b32 s7, -1.0 ; GFX9-NEXT: s_mov_b32 s8, 0.5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[10:11] ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6204,15 +6194,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6295,17 +6285,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6392,19 +6382,19 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[10:11] ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6434,10 +6424,10 @@ ; GFX10-NEXT: s_mov_b32 s9, 0x40200000 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[10:11] ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -6468,10 +6458,10 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -6496,13 +6486,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6585,13 +6575,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6674,13 +6664,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6762,15 +6752,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6853,15 +6843,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -6945,13 +6935,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7033,15 +7023,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7125,13 +7115,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7214,13 +7204,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7302,15 +7292,15 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[6:7] ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7393,16 +7383,16 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7487,17 +7477,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_mov_b32 s7, 6 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7585,13 +7575,13 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7673,17 +7663,17 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7770,18 +7760,18 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[10:11] ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7871,15 +7861,15 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[12:13] ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -7966,9 +7956,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 @@ -7977,10 +7967,10 @@ ; GFX9-NEXT: s_mov_b32 s9, 6 ; GFX9-NEXT: s_mov_b32 s10, 7 ; GFX9-NEXT: s_mov_b32 s11, 8 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[12:13] ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -8012,10 +8002,10 @@ ; GFX10-NEXT: s_mov_b32 s11, 8 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[12:13] ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -8048,10 +8038,10 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -8076,15 +8066,15 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[20:21] ; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -8217,10 +8207,10 @@ ; GFX9-NEXT: s_mov_b32 s27, s43 ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 ; GFX9-NEXT: v_readlane_b32 s5, v40, 17 @@ -8455,8 +8445,8 @@ ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_mov_b32_e32 v0, s22 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, s22 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s46 @@ -8482,10 +8472,10 @@ ; GFX9-NEXT: s_mov_b32 s28, s44 ; GFX9-NEXT: s_mov_b32 s29, s45 ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_getpc_b64 s[30:31] ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 ; GFX9-NEXT: v_readlane_b32 s5, v40, 17 @@ -8711,12 +8701,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -8742,19 +8732,19 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -8853,10 +8843,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_mov_b32_e32 v31, 11 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -8920,10 +8910,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -8985,10 +8975,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -9076,10 +9066,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_mov_b32_e32 v31, 7 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -9105,13 +9095,13 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 +; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -9151,10 +9141,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -9221,10 +9211,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 @@ -9308,10 +9298,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -9337,13 +9327,13 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -9383,10 +9373,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -9453,10 +9443,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s1, v40, 1 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -14,13 +14,13 @@ ; GFX9-NEXT: v_writelane_b32 v40, s33, 4 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s35, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND @@ -109,17 +109,17 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s34, s31 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: s_mov_b32 s34, s31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 1 ; GFX9-NEXT: s_mov_b32 s31, s34 @@ -187,18 +187,18 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v41, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v31, v41 ; GFX9-NEXT: ;;#ASMSTART @@ -231,10 +231,10 @@ ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v31 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 @@ -267,16 +267,16 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s33 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: ;;#ASMSTART @@ -339,16 +339,16 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s34, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s34 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -413,17 +413,17 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v41, s33, 2 -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v40 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v40 @@ -451,13 +451,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: ;;#ASMSTART @@ -572,13 +572,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s33@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s33@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -629,13 +629,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, void_func_void_clobber_s34@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void_clobber_s34@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -687,16 +687,16 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s40, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s4, v40, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -762,21 +762,21 @@ ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v40, s33, 3 ; GFX9-NEXT: v_writelane_b32 v40, s40, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 -; GFX9-NEXT: v_mov_b32_e32 v41, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s40 @@ -807,10 +807,10 @@ ; GFX10-NEXT: v_writelane_b32 v40, s33, 3 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v40, s40, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -65,8 +65,8 @@ ; SI-NEXT: s_add_u32 s4, s0, 4 ; SI-NEXT: s_addc_u32 s5, s1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v5, s2 @@ -82,8 +82,8 @@ ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 @@ -336,16 +336,16 @@ ; SI-NEXT: s_lshr_b32 s8, s3, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; SI-NEXT: s_add_u32 s0, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 ; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 @@ -364,16 +364,16 @@ ; VI-NEXT: s_lshr_b32 s8, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s4 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 ; VI-NEXT: s_add_u32 s0, s6, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 ; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: s_nop 0 @@ -462,14 +462,14 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_mov_b32_e32 v6, s2 ; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; SI-NEXT: v_mov_b32_e32 v5, s1 @@ -484,14 +484,14 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -515,13 +515,13 @@ ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_mov_b32_e32 v9, s3 ; SI-NEXT: v_mov_b32_e32 v8, s2 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; SI-NEXT: s_nop 0 @@ -541,13 +541,13 @@ ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; VI-NEXT: s_nop 0 @@ -573,13 +573,13 @@ ; SI-NEXT: s_lshr_b32 s8, s1, 16 ; SI-NEXT: s_lshr_b32 s4, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; SI-NEXT: s_add_u32 s0, s6, 48 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; SI-NEXT: s_addc_u32 s1, s7, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; SI-NEXT: s_addc_u32 s1, s7, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v17, s1 ; SI-NEXT: v_mov_b32_e32 v16, s0 @@ -592,12 +592,13 @@ ; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; SI-NEXT: v_mov_b32_e32 v13, s1 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_add_u32 s0, s6, 16 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -616,19 +617,19 @@ ; VI-NEXT: s_lshr_b32 s8, s2, 16 ; VI-NEXT: s_lshr_b32 s9, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; VI-NEXT: s_lshr_b32 s5, s1, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; VI-NEXT: s_add_u32 s0, s6, 48 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; VI-NEXT: s_addc_u32 s1, s7, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; VI-NEXT: s_addc_u32 s1, s7, 0 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: v_mov_b32_e32 v16, s0 @@ -638,12 +639,13 @@ ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 ; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s6, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -854,9 +856,9 @@ ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -958,20 +960,21 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; SI-NEXT: s_nop 0 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 ; SI-NEXT: v_mov_b32_e32 v5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 ; SI-NEXT: v_mov_b32_e32 v4, s0 @@ -1005,8 +1008,8 @@ ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -1014,12 +1017,12 @@ ; VI-NEXT: v_mov_b32_e32 v20, s2 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; VI-NEXT: s_waitcnt vmcnt(1) @@ -1076,10 +1079,10 @@ ; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm ; @@ -1093,10 +1096,10 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %val = load <2 x half>, <2 x half> addrspace(1)* %in @@ -1120,12 +1123,12 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1201,12 +1204,12 @@ ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 ; VI-NEXT: v_mov_b32_e32 v11, s3 @@ -1233,8 +1236,8 @@ ; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_mov_b32_e32 v6, s2 ; SI-NEXT: s_add_u32 s2, s0, 32 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v13, s1 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_add_u32 s0, s0, 16 ; SI-NEXT: v_mov_b32_e32 v15, s3 @@ -1242,25 +1245,26 @@ ; SI-NEXT: v_mov_b32_e32 v14, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; SI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_nop 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 +; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 ; SI-NEXT: v_mov_b32_e32 v17, s1 ; SI-NEXT: v_mov_b32_e32 v16, s0 @@ -1281,22 +1285,22 @@ ; VI-NEXT: v_mov_b32_e32 v8, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v13, s1 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: v_mov_b32_e32 v15, s3 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 @@ -1359,37 +1363,38 @@ ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; SI-NEXT: v_mov_b32_e32 v15, s3 +; SI-NEXT: s_nop 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v14, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_mov_b32_e32 v15, s3 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; SI-NEXT: v_mov_b32_e32 v14, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x60 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 ; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; SI-NEXT: v_mov_b32_e32 v17, s3 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_mov_b32_e32 v17, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 ; SI-NEXT: v_mov_b32_e32 v16, s2 @@ -1400,10 +1405,10 @@ ; SI-NEXT: s_add_u32 s0, s0, 64 ; SI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 ; SI-NEXT: v_mov_b32_e32 v19, s3 ; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_mov_b32_e32 v18, s2 @@ -1449,43 +1454,45 @@ ; VI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_mov_b32_e32 v14, s2 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; VI-NEXT: s_add_u32 s2, s0, 0x60 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_f16_e32 v10, v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v9 +; VI-NEXT: v_cvt_f32_f16_sdwa v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v4, v7 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 +; VI-NEXT: v_mov_b32_e32 v15, s3 +; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_add_u32 s2, s0, 0x60 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; VI-NEXT: v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_cvt_f32_f16_sdwa v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v17, s3 ; VI-NEXT: v_mov_b32_e32 v16, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 ; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 -; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v6 ; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 ; VI-NEXT: s_addc_u32 s1, s1, 0 @@ -1579,8 +1586,8 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: flat_store_short v[0:1], v2 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_or_b32_e32 v2, v4, v3 ; SI-NEXT: v_mov_b32_e32 v1, s1 @@ -1644,10 +1651,10 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v5, v0 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v5, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_or_b32_e32 v3, v2, v3 @@ -1678,20 +1685,20 @@ ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v6, v0 -; SI-NEXT: v_or_b32_e32 v3, v2, v3 ; SI-NEXT: v_or_b32_e32 v0, v4, v5 +; SI-NEXT: v_or_b32_e32 v3, v2, v3 ; SI-NEXT: v_or_b32_e32 v2, v10, v7 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; SI-NEXT: s_endpgm @@ -1712,17 +1719,17 @@ ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v11, v0 -; VI-NEXT: v_or_b32_e32 v3, v2, v3 ; VI-NEXT: v_or_b32_e32 v1, v6, v7 ; VI-NEXT: v_or_b32_e32 v0, v4, v5 +; VI-NEXT: v_or_b32_e32 v3, v2, v3 ; VI-NEXT: v_or_b32_e32 v2, v11, v10 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm @@ -1745,8 +1752,8 @@ ; SI-NEXT: v_mov_b32_e32 v13, s3 ; SI-NEXT: s_addc_u32 s5, s3, 0 ; SI-NEXT: v_mov_b32_e32 v12, s2 -; SI-NEXT: s_add_u32 s2, s2, 16 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_add_u32 s2, s2, 16 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: s_addc_u32 s3, s3, 0 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -1759,39 +1766,38 @@ ; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_or_b32_e32 v3, v6, v2 -; SI-NEXT: v_or_b32_e32 v2, v17, v7 ; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; SI-NEXT: v_or_b32_e32 v3, v6, v2 +; SI-NEXT: v_or_b32_e32 v2, v17, v7 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: v_or_b32_e32 v1, v14, v6 ; SI-NEXT: v_or_b32_e32 v0, v12, v7 @@ -1813,8 +1819,8 @@ ; VI-NEXT: v_mov_b32_e32 v13, s3 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v12, s2 -; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -1845,8 +1851,8 @@ ; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v16 ; VI-NEXT: v_or_b32_e32 v3, v6, v7 ; VI-NEXT: v_or_b32_e32 v2, v18, v17 @@ -1910,8 +1916,8 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: s_lshr_b32 s0, s1, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: v_add_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1966,23 +1972,23 @@ ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v7, v7, v9 -; SI-NEXT: v_add_f32_e32 v0, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v6, v6, v8 ; SI-NEXT: v_add_f32_e32 v1, v1, v3 +; SI-NEXT: v_add_f32_e32 v0, v0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v6, v6, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2035,25 +2041,25 @@ ; SI-NEXT: s_lshr_b32 s0, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s0 ; SI-NEXT: s_lshr_b32 s0, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 ; SI-NEXT: s_lshr_b32 s11, s1, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 ; SI-NEXT: s_lshr_b32 s10, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; SI-NEXT: s_lshr_b32 s0, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s0 ; SI-NEXT: s_lshr_b32 s10, s3, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s0 ; SI-NEXT: s_lshr_b32 s0, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: v_add_f32_e32 v1, v1, v9 ; SI-NEXT: v_add_f32_e32 v0, v0, v8 ; SI-NEXT: v_add_f32_e32 v3, v3, v11 @@ -2112,8 +2118,8 @@ ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_f16_e32 v1, s1, v1 -; VI-NEXT: s_lshr_b32 s2, s0, 16 ; VI-NEXT: s_lshr_b32 s1, s4, 16 +; VI-NEXT: s_lshr_b32 s2, s0, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v4, s2 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -35,8 +35,8 @@ ; GFX9-NEXT: s_add_u32 s2, s2, 1 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -135,8 +135,8 @@ ; GFX9-NEXT: s_add_u32 s2, s2, 1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -206,14 +206,13 @@ ; GFX9-LABEL: sdiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s2 ; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -236,8 +235,9 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 ; GFX9-NEXT: s_add_i32 s4, s4, 1 +; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -248,12 +248,10 @@ ; ; GFX10-LABEL: sdiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s2, s3, 31 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_add_i32 s3, s3, s2 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 @@ -273,8 +271,8 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 ; GFX10-NEXT: s_add_i32 s4, s4, 1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s3, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 @@ -282,6 +280,7 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -311,14 +310,13 @@ ; GFX9-LABEL: srem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s3, s2, 31 ; GFX9-NEXT: s_add_i32 s2, s2, s3 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_sub_i32 s3, 0, s2 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -337,8 +335,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 @@ -349,12 +348,10 @@ ; ; GFX10-LABEL: srem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_nop 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_add_i32 s2, s2, s3 ; GFX10-NEXT: s_xor_b32 s2, s2, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -379,6 +376,7 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 @@ -431,10 +429,10 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -461,9 +459,9 @@ ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 ; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v2 @@ -508,13 +506,13 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 ; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 @@ -610,8 +608,8 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v2| ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v0, v8, v0 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: v_add_u32_e32 v0, v8, v0 ; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -640,10 +638,10 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_trunc_f32_e32 v0, v0 ; GFX10-NEXT: v_or_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v2| @@ -700,12 +698,12 @@ ; GFX9-NEXT: v_add_u32_e32 v0, v11, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 ; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0 ; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 @@ -741,9 +739,9 @@ ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v9 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v7, v0 ; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz BB7_1 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -46,6 +46,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -54,7 +55,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -185,15 +185,15 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -323,6 +323,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -331,7 +332,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -457,6 +457,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -465,7 +466,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -599,6 +599,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -607,7 +608,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -738,6 +738,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -746,7 +747,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -1127,10 +1127,10 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 @@ -1156,6 +1156,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1168,7 +1169,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1582,6 +1582,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1590,7 +1591,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1725,6 +1725,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1733,7 +1734,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -1876,6 +1876,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1884,7 +1885,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2029,6 +2029,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2037,7 +2038,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2189,6 +2189,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2197,7 +2198,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2348,6 +2348,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2356,7 +2357,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -2503,6 +2503,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2511,7 +2512,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 @@ -2636,8 +2636,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v4, off, s[0:3], 0 ; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -2662,12 +2662,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v6, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4 @@ -2799,6 +2799,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2807,7 +2808,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -47,6 +47,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -55,7 +56,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -194,18 +194,18 @@ ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 -; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 -; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 @@ -225,31 +225,31 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX8-NEXT: v_bfe_i32 v7, v4, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v0 ; GFX8-NEXT: v_bfe_i32 v8, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v1, v7, v8, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX8-NEXT: v_mad_u16 v1, v9, v10, v1 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: v_mad_u16 v1, v5, v6, v1 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 @@ -267,11 +267,11 @@ ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_bfe_i32 v7, v3, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v8, v8, 0, 8 @@ -301,11 +301,11 @@ ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_i32 v7, v3, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX9-DL-NEXT: v_bfe_i32 v8, v8, 0, 8 @@ -337,10 +337,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) +; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v2 @@ -348,10 +348,10 @@ ; GFX10-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 @@ -415,19 +415,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -482,9 +482,9 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -508,9 +508,9 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -624,6 +624,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -632,7 +633,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 ; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 @@ -815,6 +815,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -823,18 +824,17 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3 ; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 24, v0 ; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2 @@ -979,15 +979,15 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_and_b32_e32 v6, s4, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 @@ -1006,10 +1006,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v1, v[2:3] @@ -1049,20 +1049,20 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 ; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 ; GFX9-NODL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 @@ -1086,20 +1086,20 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 ; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 ; GFX9-DL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 ; GFX9-DL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 @@ -1121,21 +1121,21 @@ ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v1 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v2 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 ; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_ashrrev_i16 v8, 8, v2 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -49,6 +49,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -57,7 +58,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -186,19 +186,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -218,24 +218,24 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX8-NEXT: v_and_b32_e32 v6, s0, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 ; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v7, s0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v1, v6, v7, v1 @@ -259,11 +259,11 @@ ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX9-NODL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -290,11 +290,11 @@ ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 ; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -323,10 +323,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) +; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 @@ -398,19 +398,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -465,9 +465,9 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -491,9 +491,9 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v2, v3, v4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -722,19 +722,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -789,9 +789,9 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -815,9 +815,9 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3 @@ -905,19 +905,19 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v3, v6, v3, v8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 @@ -937,12 +937,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ubyte v10, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v4 @@ -950,9 +950,9 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v6, v7, v6, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX8-NEXT: v_mad_u16 v0, v0, v4, v6 ; GFX8-NEXT: v_mad_u16 v0, v5, v1, v0 @@ -977,8 +977,8 @@ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2 @@ -1003,8 +1003,8 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v5, v6, v5, v9 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v2, v3, v2, v5 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v4, v0, v2 @@ -1118,6 +1118,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1126,7 +1127,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 @@ -1293,9 +1293,9 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 @@ -1315,6 +1315,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1323,7 +1324,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 @@ -1331,9 +1331,9 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s3 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 @@ -1515,11 +1515,11 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_ushort v10, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) @@ -1527,6 +1527,7 @@ ; GFX8-NEXT: v_and_b32_e32 v7, s2, v7 ; GFX8-NEXT: v_bfe_i32 v1, v4, 0, 8 ; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v8, s2, v8 @@ -1535,7 +1536,6 @@ ; GFX8-NEXT: v_mad_u16 v7, v7, v8, v10 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mad_u16 v1, v1, v6, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u16 v1, v9, v5, v1 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 @@ -1723,6 +1723,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1731,7 +1732,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 @@ -1870,16 +1870,16 @@ ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 ; GFX7-NEXT: v_and_b32_e32 v3, s5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s5, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX7-NEXT: v_and_b32_e32 v3, s5, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 @@ -1905,17 +1905,17 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v10, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v4 +; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 @@ -1944,23 +1944,23 @@ ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NODL-NEXT: v_and_b32_e32 v10, v4, v10 ; GFX9-NODL-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 ; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 @@ -1982,23 +1982,23 @@ ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 ; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 @@ -2021,15 +2021,15 @@ ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) ; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9 @@ -2089,21 +2089,21 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v3, v4, v7, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -19,10 +19,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -87,6 +87,8 @@ ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4 +; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 +; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 ; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4 @@ -96,10 +98,8 @@ ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1 -; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4 @@ -136,20 +136,20 @@ ; GFX9-NEXT: v_bfe_i32 v4, v2, 0, 4 ; GFX9-NEXT: v_bfe_i32 v5, v1, 4, 4 ; GFX9-NEXT: v_bfe_i32 v6, v2, 4, 4 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v4 ; GFX9-NEXT: v_bfe_i32 v7, v1, 8, 4 ; GFX9-NEXT: v_bfe_i32 v8, v2, 8, 4 -; GFX9-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX9-NEXT: v_bfe_i32 v9, v1, 12, 4 ; GFX9-NEXT: v_bfe_i32 v10, v2, 12, 4 ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4 -; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4 +; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_i32_i24_e32 v4, v5, v6 ; GFX9-NEXT: v_mul_i32_i24_e32 v5, v7, v8 ; GFX9-NEXT: v_mul_i32_i24_e32 v6, v9, v10 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 @@ -175,11 +175,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] @@ -333,10 +333,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 @@ -422,23 +422,23 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -453,18 +453,18 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 @@ -484,12 +484,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -504,14 +504,14 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 @@ -526,18 +526,18 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -557,12 +557,12 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -577,14 +577,14 @@ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 @@ -599,18 +599,18 @@ ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -649,45 +649,45 @@ ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 @@ -727,45 +727,45 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 @@ -907,10 +907,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 @@ -996,23 +996,23 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -1027,18 +1027,18 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_mad_u16 v4, v9, v14, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_mad_u16 v4, v8, v13, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX8-NEXT: v_mad_u16 v4, v17, v18, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_mad_u16 v4, v7, v12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 @@ -1058,12 +1058,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1078,14 +1078,14 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 @@ -1100,18 +1100,18 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -1131,12 +1131,12 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) @@ -1151,14 +1151,14 @@ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 @@ -1173,18 +1173,18 @@ ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v13, v3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v16, v17, v3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v6, v11, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -1223,45 +1223,45 @@ ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v9, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 @@ -1301,45 +1301,45 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v1, v17, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v8, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 @@ -1482,10 +1482,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -1552,6 +1552,8 @@ ; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4 ; GFX8-NEXT: v_bfe_i32 v6, v3, 8, 4 ; GFX8-NEXT: v_bfe_i32 v8, v3, 12, 4 +; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 +; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1562,10 +1564,8 @@ ; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 ; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 -; GFX8-NEXT: v_bfe_i32 v10, v3, 16, 4 ; GFX8-NEXT: v_bfe_i32 v11, v0, 16, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v8, v9, v1 -; GFX8-NEXT: v_bfe_i32 v12, v3, 20, 4 ; GFX8-NEXT: v_bfe_i32 v13, v0, 20, 4 ; GFX8-NEXT: v_mad_i32_i24 v1, v10, v11, v1 ; GFX8-NEXT: v_bfe_i32 v14, v3, 24, 4 @@ -1610,8 +1610,8 @@ ; GFX9-NEXT: v_bfe_i32 v11, v1, 16, 4 ; GFX9-NEXT: v_bfe_i32 v12, v2, 16, 4 ; GFX9-NEXT: v_bfe_i32 v13, v1, 20, 4 -; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_i32 v14, v2, 20, 4 +; GFX9-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_i32 v16, v2, 24, 4 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 28, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 28, v2 @@ -1661,8 +1661,8 @@ ; GFX9-DL-NEXT: v_bfe_i32 v11, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_i32 v12, v2, 16, 4 ; GFX9-DL-NEXT: v_bfe_i32 v13, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-DL-NEXT: v_bfe_i32 v14, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_i32 v15, v1, 24, 4 ; GFX9-DL-NEXT: v_bfe_i32 v16, v2, 24, 4 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 28, v2 @@ -1917,10 +1917,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -2030,21 +2030,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 28, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2 ; GFX9-NEXT: v_bfe_i32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4 ; GFX9-NEXT: v_bfe_i32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_i32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_i32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4 ; GFX9-NEXT: v_bfe_i32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_i32 v8, v1, 8, 4 ; GFX9-NEXT: v_bfe_i32 v9, v1, 4, 4 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 28, v2 +; GFX9-NEXT: v_bfe_i32 v11, v2, 24, 4 +; GFX9-NEXT: v_bfe_i32 v12, v2, 20, 4 +; GFX9-NEXT: v_bfe_i32 v13, v2, 16, 4 +; GFX9-NEXT: v_bfe_i32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_i32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_i32 v16, v2, 4, 4 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 4 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 4 ; GFX9-NEXT: v_mul_i32_i24_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v2, v9, v16 @@ -2073,11 +2073,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_i32_i4 v0, v2, v3, s0 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] @@ -2195,10 +2195,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 @@ -2238,19 +2238,19 @@ ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 +; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 ; GFX7-NEXT: v_and_b32_e32 v12, v2, v14 ; GFX7-NEXT: v_and_b32_e32 v13, v2, v15 +; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_and_b32_e32 v14, v2, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX7-NEXT: v_and_b32_e32 v2, v2, v5 ; GFX7-NEXT: buffer_load_ushort v5, off, s[0:3], 0 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 @@ -2297,21 +2297,21 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 @@ -2328,16 +2328,16 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 @@ -2358,30 +2358,30 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v0, s[4:5] ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 ; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 ; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 ; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v10, v2, v10 ; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 +; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 ; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 +; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 ; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v10, v2, v10 ; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 @@ -2394,14 +2394,14 @@ ; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v16, 16, v15 ; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 ; GFX9-NEXT: v_and_b32_e32 v2, v2, v11 +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v2, v12, 16, v2 @@ -2414,8 +2414,8 @@ ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v5 ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v6, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2441,30 +2441,30 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 ; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10 ; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10 ; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v10 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 @@ -2477,14 +2477,14 @@ ; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 ; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v16, 16, v15 ; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 ; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v11 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v12, 16, v2 @@ -2497,8 +2497,8 @@ ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v5 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v6, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -2535,43 +2535,43 @@ ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 8, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 20, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 12, 4 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v18 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v13 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 @@ -2623,43 +2623,43 @@ ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 15, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 20, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 12, 4 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v18 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v13 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3 @@ -2807,10 +2807,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 @@ -2839,15 +2839,15 @@ ; GFX7-NEXT: v_and_b32_e32 v9, v2, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v4 ; GFX7-NEXT: v_bfe_i32 v5, v4, 24, 4 ; GFX7-NEXT: v_bfe_i32 v10, v4, 4, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v4 ; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX7-NEXT: v_ashrrev_i32_e32 v12, 28, v0 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 ; GFX7-NEXT: v_bfe_i32 v15, v0, 16, 4 ; GFX7-NEXT: v_bfe_i32 v17, v0, 8, 4 @@ -2861,29 +2861,29 @@ ; GFX7-NEXT: v_and_b32_e32 v14, v2, v17 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX7-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 -; GFX7-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v6, s5, v6 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX7-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX7-NEXT: v_or_b32_e32 v10, v14, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v10 ; GFX7-NEXT: v_and_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 ; GFX7-NEXT: v_and_b32_e32 v7, v3, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v7, v6 ; GFX7-NEXT: v_and_b32_e32 v7, v2, v4 ; GFX7-NEXT: v_and_b32_e32 v13, v2, v0 ; GFX7-NEXT: v_bfe_u32 v8, v4, 8, 8 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_and_b32_e32 v12, v2, v12 @@ -2930,60 +2930,60 @@ ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 -; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 -; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 ; GFX8-NEXT: v_mul_lo_u16_e32 v15, v16, v18 -; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 @@ -3016,67 +3016,67 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 @@ -3109,67 +3109,67 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 4, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v1 ; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 @@ -3231,10 +3231,10 @@ ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 @@ -3251,33 +3251,33 @@ ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v12 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11 ; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] @@ -3318,27 +3318,27 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 @@ -3347,38 +3347,38 @@ ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v9, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9 ; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -17,10 +17,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -130,21 +130,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 @@ -173,11 +173,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] @@ -296,8 +296,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -342,12 +342,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v18, v[2:3] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 @@ -394,11 +394,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: global_load_ushort v17, v1, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 @@ -440,11 +440,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ushort v17, v1, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 @@ -614,8 +614,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -660,12 +660,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ubyte v18, v[2:3] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 @@ -712,11 +712,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-NEXT: v_bfe_u32 v4, v2, 24, 4 @@ -758,11 +758,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 @@ -932,8 +932,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -979,12 +979,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ubyte v18, v[2:3] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 @@ -1032,11 +1032,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 @@ -1079,11 +1079,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 @@ -1239,8 +1239,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -1286,12 +1286,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ubyte v18, v[2:3] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 @@ -1339,11 +1339,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v2 @@ -1386,11 +1386,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v17, v1, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 @@ -1536,10 +1536,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -1654,20 +1654,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 ; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1705,20 +1705,20 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1763,10 +1763,10 @@ ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 ; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 @@ -1872,10 +1872,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -1985,21 +1985,21 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 ; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 ; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 @@ -2028,11 +2028,11 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-DL-NEXT: v_dot8_u32_u4 v0, v2, v3, s0 ; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] @@ -2108,10 +2108,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -2133,27 +2133,27 @@ ; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v0 +; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 ; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 ; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v7, v16 ; GFX7-NEXT: v_bfe_u32 v12, v0, 8, 4 -; GFX7-NEXT: v_mad_u32_u24 v6, v8, v13, v6 ; GFX7-NEXT: v_bfe_u32 v14, v0, 20, 4 +; GFX7-NEXT: v_mad_u32_u24 v6, v8, v13, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 28, v0 ; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v11, v0, 12, 4 ; GFX7-NEXT: v_alignbit_b32 v0, v14, v0, 16 ; GFX7-NEXT: v_mad_u32_u24 v5, v5, v12, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: v_mad_u32_u24 v4, v4, v11, v5 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v4 @@ -2175,12 +2175,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v18, v[2:3] ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s11, 0xe80000 @@ -2192,6 +2192,9 @@ ; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4 ; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4 ; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 +; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v11, 15, v0 ; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4 @@ -2203,13 +2206,10 @@ ; GFX8-NEXT: v_mad_u16 v1, v6, v13, v1 ; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4 ; GFX8-NEXT: v_mad_u16 v1, v7, v14, v1 -; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 ; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4 ; GFX8-NEXT: v_mad_u16 v1, v8, v15, v1 -; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 ; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4 ; GFX8-NEXT: v_mad_u16 v1, v9, v16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 ; GFX8-NEXT: v_mad_u16 v1, v10, v17, v1 ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v1 @@ -2227,44 +2227,44 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v3, v0, s[4:5] ; GFX9-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 ; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 ; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 ; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 ; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 ; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 ; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 ; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-NEXT: v_and_b32_e32 v11, v2, v11 +; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 +; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 +; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v15 ; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v6, v5 @@ -2294,44 +2294,44 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 ; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-DL-NEXT: v_and_b32_e32 v11, v2, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 ; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 ; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-DL-NEXT: v_lshl_or_b32 v5, v14, 16, v13 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX9-DL-NEXT: v_lshl_or_b32 v7, v16, 16, v15 ; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v6, v5 @@ -2395,17 +2395,17 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6 @@ -2469,10 +2469,10 @@ ; GFX7-NEXT: s_mov_b32 s15, 0xe8f000 ; GFX7-NEXT: s_add_u32 s12, s12, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: s_mov_b32 s10, 0 ; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 @@ -2487,43 +2487,43 @@ ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 28, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 ; GFX7-NEXT: v_bfe_u32 v1, v2, 8, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 ; GFX7-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 12, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_and_b32_e32 v6, v3, v9 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v3, v3, v11 +; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 -; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 4 @@ -2531,8 +2531,8 @@ ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 4 ; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v6 @@ -2543,8 +2543,8 @@ ; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 4 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v15, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 @@ -2577,23 +2577,23 @@ ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_bfe_u32 v3, v4, 20, 4 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4 ; GFX8-NEXT: v_bfe_u32 v7, v4, 24, 4 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4 ; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 ; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 4 -; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 ; GFX8-NEXT: v_bfe_u32 v9, v4, 8, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4 ; GFX8-NEXT: v_bfe_u32 v10, v4, 12, 4 ; GFX8-NEXT: v_and_b32_e32 v11, 15, v4 +; GFX8-NEXT: v_bfe_u32 v4, v4, 4, 4 +; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4 ; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4 ; GFX8-NEXT: v_and_b32_e32 v18, 15, v2 -; GFX8-NEXT: v_bfe_u32 v4, v4, 4, 4 ; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14 ; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_e32 v19, v6, v12 @@ -2602,12 +2602,12 @@ ; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18 ; GFX8-NEXT: v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v8, v13, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX8-NEXT: v_or_b32_e32 v10, v11, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v4, v4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] @@ -2636,44 +2636,44 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2 ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX9-NEXT: v_and_b32_e32 v10, 15, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 ; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX9-NEXT: v_or_b32_e32 v1, v18, v0 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX9-NEXT: v_or_b32_e32 v9, v10, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] @@ -2702,44 +2702,44 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 ; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0 +; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] @@ -2784,16 +2784,16 @@ ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10 ; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13 ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13 +; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14 ; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15 @@ -2801,12 +2801,12 @@ ; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10 ; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13 ; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12 -; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 ; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 ; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9 @@ -2883,8 +2883,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) @@ -2930,8 +2930,8 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -2945,18 +2945,18 @@ ; GFX8-NEXT: v_bfe_u32 v5, v4, 4, 4 ; GFX8-NEXT: v_bfe_u32 v6, v4, 8, 4 ; GFX8-NEXT: v_bfe_u32 v7, v4, 12, 4 +; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 +; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v11, 15, v0 ; GFX8-NEXT: v_bfe_u32 v12, v0, 4, 4 ; GFX8-NEXT: v_bfe_u32 v13, v0, 8, 4 ; GFX8-NEXT: v_bfe_u32 v14, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 4 ; GFX8-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v4, 20, 4 -; GFX8-NEXT: v_bfe_u32 v10, v4, 24, 4 ; GFX8-NEXT: v_bfe_u32 v16, v0, 20, 4 ; GFX8-NEXT: v_bfe_u32 v17, v0, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 28, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, v4, v0 ; GFX8-NEXT: v_mul_u32_u24_e32 v4, v10, v17 @@ -2991,27 +2991,27 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_e32 v0, 15, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-NEXT: v_bfe_u32 v4, v2, 4, 4 -; GFX9-NEXT: v_bfe_u32 v11, v3, 4, 4 ; GFX9-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v12, v3, 8, 4 ; GFX9-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v13, v3, 12, 4 ; GFX9-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v3, 16, 4 ; GFX9-NEXT: v_bfe_u32 v8, v2, 20, 4 ; GFX9-NEXT: v_bfe_u32 v9, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX9-NEXT: v_bfe_u32 v12, v3, 8, 4 +; GFX9-NEXT: v_bfe_u32 v13, v3, 12, 4 +; GFX9-NEXT: v_bfe_u32 v14, v3, 16, 4 ; GFX9-NEXT: v_bfe_u32 v15, v3, 20, 4 ; GFX9-NEXT: v_bfe_u32 v16, v3, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, v9, v16 @@ -3046,27 +3046,27 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 ; GFX9-DL-NEXT: v_bfe_u32 v4, v2, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 4, 4 ; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 8, 4 ; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-DL-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 20, 4 ; GFX9-DL-NEXT: v_bfe_u32 v16, v3, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, v9, v16 @@ -3232,6 +3232,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -3240,22 +3241,21 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v1, 15, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4 ; GFX8-NEXT: v_bfe_u32 v6, v3, 8, 4 ; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 4, 4 ; GFX8-NEXT: v_bfe_u32 v7, v0, 8, 4 ; GFX8-NEXT: v_bfe_u32 v9, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 16, 4 ; GFX8-NEXT: v_bfe_u32 v11, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v12, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4 ; GFX8-NEXT: v_bfe_u32 v13, v0, 20, 4 +; GFX8-NEXT: v_bfe_u32 v14, v3, 24, 4 ; GFX8-NEXT: v_bfe_u32 v15, v0, 24, 4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0 @@ -3296,8 +3296,8 @@ ; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v12, v2, 16, 4 ; GFX9-NEXT: v_bfe_u32 v13, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v15, v1, 24, 4 ; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -454,8 +454,8 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v3, s8 @@ -478,8 +478,8 @@ ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s8 @@ -507,12 +507,12 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v4, s8 @@ -535,12 +535,12 @@ ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -568,28 +568,28 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v8, s12 @@ -613,28 +613,28 @@ ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 2 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s15 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 6 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s14 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc ; VI-NEXT: v_mov_b32_e32 v8, s12 @@ -774,8 +774,8 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 @@ -820,16 +820,16 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: s_cmp_eq_u32 s6, 3 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s6, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_eq_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 @@ -880,28 +880,28 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s14 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v4, s12 @@ -1164,8 +1164,8 @@ ; SI-NEXT: s_andn2_b32 s5, s6, s4 ; SI-NEXT: s_and_b32 s4, s4, 0x5050505 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 @@ -1184,8 +1184,8 @@ ; VI-NEXT: s_andn2_b32 s5, s6, s4 ; VI-NEXT: s_and_b32 s4, s4, 0x5050505 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 @@ -1306,28 +1306,28 @@ ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s6, s11, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_movk_i32 s5, 0xff -; SI-NEXT: s_cmp_lg_u32 s4, 13 +; SI-NEXT: s_lshr_b32 s6, s11, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 13 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 12 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v2, s5, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_mov_b32 s6, 0xffff ; SI-NEXT: s_lshr_b32 s7, s10, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 11 ; SI-NEXT: v_or_b32_e32 v3, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1338,24 +1338,24 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_lshr_b32 s7, s10, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 9 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v2, s5, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: s_lshr_b32 s7, s9, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_or_b32_e32 v2, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1366,24 +1366,24 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; SI-NEXT: s_lshr_b32 s7, s9, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_and_b32_e32 v4, s5, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: s_lshr_b32 s7, s8, 24 -; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: v_or_b32_e32 v1, v1, v0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1394,12 +1394,12 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: s_lshr_b32 s7, s8, 8 -; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: v_and_b32_e32 v4, s5, v4 +; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s8 @@ -1436,15 +1436,15 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: s_cmp_lg_u32 s4, 13 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 12 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s11 -; VI-NEXT: s_lshr_b32 s5, s10, 24 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; VI-NEXT: s_lshr_b32 s5, s10, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_cmp_lg_u32 s4, 11 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1460,15 +1460,15 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: s_cmp_lg_u32 s4, 9 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 8 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: s_lshr_b32 s5, s9, 24 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc +; VI-NEXT: s_lshr_b32 s5, s9, 24 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_cmp_lg_u32 s4, 7 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1484,15 +1484,15 @@ ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc ; VI-NEXT: s_cmp_lg_u32 s4, 5 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 4 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s9 -; VI-NEXT: s_lshr_b32 s5, s8, 24 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc +; VI-NEXT: s_lshr_b32 s5, s8, 24 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_cmp_lg_u32 s4, 3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1508,8 +1508,8 @@ ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: s_cmp_lg_u32 s4, 1 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_lg_u32 s4, 0 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; VI-NEXT: v_mov_b32_e32 v5, s8 @@ -1660,8 +1660,8 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s6, 1 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cmp_eq_u32 s6, 0 @@ -1715,8 +1715,8 @@ ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cmp_eq_u32 s12, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cmp_eq_u32 s12, 2 @@ -1775,14 +1775,14 @@ ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: s_cmp_eq_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_cmp_eq_u32 s4, 3 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v5, s15 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc ; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: s_cmp_eq_u32 s4, 2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -583,8 +583,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -603,8 +603,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -706,9 +706,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -764,9 +764,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -940,9 +940,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -998,9 +998,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 0x230000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 0x230000 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1190,8 +1190,8 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 @@ -1199,8 +1199,8 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) @@ -1225,8 +1225,8 @@ ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dword v2, v[2:3] ; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 ; CI-NEXT: s_mov_b32 s0, 0x12341234 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(1) @@ -1272,8 +1272,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1292,8 +1292,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1357,8 +1357,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1402,8 +1402,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1422,8 +1422,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1487,8 +1487,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1532,8 +1532,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1552,8 +1552,8 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1668,9 +1668,9 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s7, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s5 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1035,8 +1035,8 @@ ; VI-NEXT: s_add_u32 s4, s2, 4 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v5, s0 @@ -1721,8 +1721,8 @@ ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_byte v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_store_byte v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -2198,8 +2198,8 @@ ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2308,12 +2308,12 @@ ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -2451,12 +2451,12 @@ ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -5873,10 +5873,10 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_add_u32 s2, s0, 42 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: flat_load_ushort v1, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -616,15 +616,15 @@ ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: ds_min_f64 v4, v[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_add_i32 s0, s2, 4 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: s_add_i32 s0, s2, 4 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen @@ -656,9 +656,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: ds_min_f64 v4, v[0:1] offset:64 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_add_i32 s0, s4, 4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: s_add_i32 s0, s4, 4 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -689,9 +689,9 @@ ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: ds_min_f64 v4, v[0:1] offset:64 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_add_i32 s0, s4, 4 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: s_add_i32 s0, s4, 4 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -777,8 +777,8 @@ ; G_SI-NEXT: s_lshl_b32 s5, s2, 3 ; G_SI-NEXT: s_mov_b32 s1, 0x40450000 ; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; G_SI-NEXT: s_lshl_b32 s2, s2, 4 @@ -822,8 +822,8 @@ ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 ; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: s_add_u32 s0, s6, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: s_add_u32 s0, s6, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -856,8 +856,8 @@ ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: v_mov_b32_e32 v0, s7 ; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: s_add_u32 s0, s6, 4 ; G_VI-NEXT: v_mov_b32_e32 v2, s6 +; G_VI-NEXT: s_add_u32 s0, s6, 4 ; G_VI-NEXT: v_mov_b32_e32 v3, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -867,16 +867,16 @@ ; G_GFX9-LABEL: lds_ds_fmin_f64: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 ; G_GFX9-NEXT: s_mov_b32 s0, 0 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s0, s2, 4 @@ -885,8 +885,8 @@ ; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 -; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v5, v[0:1] ; G_GFX9-NEXT: v_mov_b32_e32 v4, s7 +; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v5, v[0:1] ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] ; G_GFX9-NEXT: v_mov_b32_e32 v2, s6 @@ -959,15 +959,15 @@ ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: ds_max_f64 v4, v[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_add_i32 s0, s2, 4 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: s_add_i32 s0, s2, 4 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen @@ -999,9 +999,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: ds_max_f64 v4, v[0:1] offset:64 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: s_add_i32 s0, s4, 4 ; GFX7-NEXT: s_waitcnt lgkmcnt(1) ; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: s_add_i32 s0, s4, 4 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1032,9 +1032,9 @@ ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: ds_max_f64 v4, v[0:1] offset:64 ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_add_i32 s0, s4, 4 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: s_add_i32 s0, s4, 4 ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1120,8 +1120,8 @@ ; G_SI-NEXT: s_lshl_b32 s5, s2, 3 ; G_SI-NEXT: s_mov_b32 s1, 0x40450000 ; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; G_SI-NEXT: s_lshl_b32 s2, s2, 4 @@ -1165,8 +1165,8 @@ ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 ; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_GFX7-NEXT: s_add_u32 s0, s6, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: s_add_u32 s0, s6, 4 ; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen @@ -1199,8 +1199,8 @@ ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: v_mov_b32_e32 v0, s7 ; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; G_VI-NEXT: s_add_u32 s0, s6, 4 ; G_VI-NEXT: v_mov_b32_e32 v2, s6 +; G_VI-NEXT: s_add_u32 s0, s6, 4 ; G_VI-NEXT: v_mov_b32_e32 v3, s0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen @@ -1210,16 +1210,16 @@ ; G_GFX9-LABEL: lds_ds_fmax_f64: ; G_GFX9: ; %bb.0: ; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 ; G_GFX9-NEXT: s_mov_b32 s0, 0 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 ; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s0, s2, 4 @@ -1228,8 +1228,8 @@ ; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 -; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v5, v[0:1] ; G_GFX9-NEXT: v_mov_b32_e32 v4, s7 +; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v5, v[0:1] ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] ; G_GFX9-NEXT: v_mov_b32_e32 v2, s6 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -369,7 +369,7 @@ ; GCN-O1-NEXT: SI post-RA bundler ; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: Machine Natural Loop Construction -; GCN-O1-NEXT: Post RA top-down list latency scheduler +; GCN-O1-NEXT: PostRA Machine Instruction Scheduler ; GCN-O1-NEXT: Machine Block Frequency Analysis ; GCN-O1-NEXT: MachinePostDominator Tree Construction ; GCN-O1-NEXT: Branch Probability Basic Block Placement @@ -653,7 +653,7 @@ ; GCN-O1-OPTS-NEXT: SI post-RA bundler ; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction -; GCN-O1-OPTS-NEXT: Post RA top-down list latency scheduler +; GCN-O1-OPTS-NEXT: PostRA Machine Instruction Scheduler ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction ; GCN-O1-OPTS-NEXT: Branch Probability Basic Block Placement @@ -939,7 +939,7 @@ ; GCN-O2-NEXT: SI post-RA bundler ; GCN-O2-NEXT: MachineDominator Tree Construction ; GCN-O2-NEXT: Machine Natural Loop Construction -; GCN-O2-NEXT: Post RA top-down list latency scheduler +; GCN-O2-NEXT: PostRA Machine Instruction Scheduler ; GCN-O2-NEXT: Machine Block Frequency Analysis ; GCN-O2-NEXT: MachinePostDominator Tree Construction ; GCN-O2-NEXT: Branch Probability Basic Block Placement @@ -1238,7 +1238,7 @@ ; GCN-O3-NEXT: SI post-RA bundler ; GCN-O3-NEXT: MachineDominator Tree Construction ; GCN-O3-NEXT: Machine Natural Loop Construction -; GCN-O3-NEXT: Post RA top-down list latency scheduler +; GCN-O3-NEXT: PostRA Machine Instruction Scheduler ; GCN-O3-NEXT: Machine Block Frequency Analysis ; GCN-O3-NEXT: MachinePostDominator Tree Construction ; GCN-O3-NEXT: Branch Probability Basic Block Placement diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -557,8 +557,8 @@ ; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 ; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX9-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -639,12 +639,12 @@ ; GFX9-LABEL: sample_c_d_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 ; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 @@ -798,8 +798,8 @@ ; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 ; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -843,12 +843,12 @@ ; GFX9-LABEL: sample_c_cd_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 ; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 +; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1718,8 +1718,8 @@ ; VERDE-NEXT: s_mov_b32 s15, 0xf000 ; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) -; VERDE-NEXT: v_mov_b32_e32 v0, v9 ; VERDE-NEXT: buffer_store_dword v10, off, s[12:15], 0 +; VERDE-NEXT: v_mov_b32_e32 v0, v9 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -548,12 +548,12 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 ; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4 ; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4 ; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) @@ -929,14 +929,14 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -965,14 +965,14 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2 +; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -49,8 +49,8 @@ ; GCN-NEXT: s_load_dword s6, s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, 42 ; GCN-NEXT: s_not_b64 exec, exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -107,9 +107,9 @@ ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -134,9 +134,9 @@ ; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -102,11 +102,11 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 @@ -320,12 +320,12 @@ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -344,8 +344,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s5, s5 -; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 @@ -361,9 +361,9 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 @@ -568,14 +568,14 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 @@ -592,8 +592,8 @@ ; SI-NEXT: v_max_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -613,8 +613,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 @@ -641,8 +641,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -693,8 +693,8 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 @@ -743,17 +743,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v0, s7, s7 -; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 @@ -776,8 +776,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -838,9 +838,9 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -864,8 +864,8 @@ ; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_max_f16_e64 v3, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 +; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -102,11 +102,11 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s12, s6 ; GFX10-NEXT: s_mov_b32 s13, s7 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_mov_b32 s1, s5 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 @@ -348,12 +348,12 @@ ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -372,8 +372,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s5, s5 -; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s5, s5, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 @@ -389,9 +389,9 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s10, s10 @@ -631,14 +631,14 @@ ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 ; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 @@ -655,8 +655,8 @@ ; SI-NEXT: v_min_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -676,8 +676,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: s_lshr_b32 s6, s6, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 ; VI-NEXT: v_max_f16_e64 v1, s6, s6 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 @@ -704,8 +704,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s4, s4 ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 +; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 @@ -756,8 +756,8 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 @@ -806,17 +806,17 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_max_f16_e64 v1, s5, s5 ; VI-NEXT: v_max_f16_e64 v0, s7, s7 -; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: s_lshr_b32 s7, s7, 16 +; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v1, s7, s7 +; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: v_max_f16_e64 v0, s6, s6 -; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_max_f16_e64 v2, s4, s4 ; VI-NEXT: s_lshr_b32 s5, s6, 16 +; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: v_max_f16_e64 v2, s5, s5 ; VI-NEXT: v_max_f16_e64 v3, s4, s4 @@ -839,8 +839,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v1, s5, s5 ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 ; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -901,9 +901,9 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -927,8 +927,8 @@ ; VI-NEXT: s_lshr_b32 s5, s5, 16 ; VI-NEXT: v_max_f16_e64 v3, s5, s5 ; VI-NEXT: v_max_f16_e64 v2, s4, s4 -; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 +; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -68,8 +68,8 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo ; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v1 -; GFX10-NEXT: v_add3_u32 v1, v6, v5, v8 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo +; GFX10-NEXT: v_add3_u32 v1, v6, v5, v8 ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4] ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -127,8 +127,8 @@ ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v6, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v7 ; GFX9-NEXT: v_mul_hi_i32 v10, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc @@ -147,8 +147,8 @@ ; GFX9-NEXT: v_add3_u32 v1, v6, v5, v7 ; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6] ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -179,10 +179,10 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v6, v0 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo ; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[4:5] @@ -350,8 +350,8 @@ ; GFX9-NEXT: s_add_u32 s6, s10, s9 ; GFX9-NEXT: s_mul_i32 s8, s1, s2 ; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_add_u32 s6, s6, s8 ; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2 +; GFX9-NEXT: s_add_u32 s6, s6, s8 ; GFX9-NEXT: s_mul_hi_i32 s7, s1, s3 ; GFX9-NEXT: s_addc_u32 s4, s5, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 @@ -387,14 +387,14 @@ ; GFX10-NEXT: s_mul_i32 s9, s0, s3 ; GFX10-NEXT: s_mul_hi_u32 s10, s0, s2 ; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3 -; GFX10-NEXT: s_add_u32 s11, s10, s9 ; GFX10-NEXT: s_mul_i32 s8, s1, s2 -; GFX10-NEXT: s_addc_u32 s5, 0, s5 +; GFX10-NEXT: s_add_u32 s11, s10, s9 ; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2 -; GFX10-NEXT: s_add_u32 s11, s11, s8 +; GFX10-NEXT: s_addc_u32 s5, 0, s5 ; GFX10-NEXT: s_mul_hi_i32 s6, s1, s3 -; GFX10-NEXT: s_addc_u32 s4, s5, s4 +; GFX10-NEXT: s_add_u32 s11, s11, s8 ; GFX10-NEXT: s_mul_i32 s7, s1, s3 +; GFX10-NEXT: s_addc_u32 s4, s5, s4 ; GFX10-NEXT: s_addc_u32 s5, s6, 0 ; GFX10-NEXT: s_add_u32 s4, s4, s7 ; GFX10-NEXT: s_addc_u32 s5, 0, s5 @@ -487,8 +487,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1 ; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1] ; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -17,10 +17,10 @@ ; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s5, 51 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -79,8 +79,8 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, -1 ; SI-NEXT: s_movk_i32 s7, 0xfc01 +; SI-NEXT: s_mov_b32 s0, -1 ; SI-NEXT: s_mov_b32 s1, 0xfffff ; SI-NEXT: s_brev_b32 s6, -2 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 @@ -92,8 +92,8 @@ ; SI-NEXT: v_not_b32_e32 v4, v4 ; SI-NEXT: v_not_b32_e32 v5, v5 ; SI-NEXT: v_and_b32_e32 v5, v3, v5 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 ; SI-NEXT: v_and_b32_e32 v4, v2, v4 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 51, v6 @@ -158,9 +158,9 @@ ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, s15 ; SI-NEXT: s_cmp_lt_i32 s14, 0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s14, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 @@ -173,11 +173,11 @@ ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 ; SI-NEXT: s_add_i32 s7, s0, s7 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] ; SI-NEXT: s_and_b32 s0, s9, s15 @@ -185,10 +185,10 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s7, 51 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -257,9 +257,9 @@ ; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] ; SI-NEXT: s_and_b32 s0, s7, s20 ; SI-NEXT: s_cmp_lt_i32 s19, 0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s17 ; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s19, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 @@ -272,10 +272,10 @@ ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 ; SI-NEXT: s_add_i32 s17, s0, s18 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] @@ -284,9 +284,9 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s17, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s17, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -297,19 +297,19 @@ ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s0, s18 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] +; SI-NEXT: s_add_i32 s6, s0, s18 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 ; SI-NEXT: v_mov_b32_e32 v6, s5 -; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] ; SI-NEXT: s_and_b32 s0, s11, s20 ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s11 @@ -323,8 +323,8 @@ ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 ; SI-NEXT: s_add_i32 s4, s0, s18 ; SI-NEXT: v_mov_b32_e32 v10, s11 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] ; SI-NEXT: s_and_b32 s0, s9, s20 @@ -332,9 +332,9 @@ ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s3 ; SI-NEXT: v_mov_b32_e32 v5, s0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s4, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s9 @@ -423,9 +423,9 @@ ; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] ; SI-NEXT: s_and_b32 s2, s7, s28 ; SI-NEXT: s_cmp_lt_i32 s26, 0 -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s25 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s26, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 @@ -438,10 +438,10 @@ ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 ; SI-NEXT: s_add_i32 s24, s2, s23 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s29, -2 ; SI-NEXT: v_mov_b32_e32 v14, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 ; SI-NEXT: v_bfi_b32 v4, s29, v14, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] @@ -450,9 +450,9 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s24, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s24, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -475,9 +475,9 @@ ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s11 @@ -500,9 +500,9 @@ ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s2 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; SI-NEXT: v_mov_b32_e32 v5, s9 @@ -513,8 +513,8 @@ ; SI-NEXT: v_mov_b32_e32 v8, s8 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 -; SI-NEXT: s_add_i32 s4, s2, s23 ; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] +; SI-NEXT: s_add_i32 s4, s2, s23 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 ; SI-NEXT: v_mov_b32_e32 v10, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 @@ -549,13 +549,13 @@ ; SI-NEXT: v_mov_b32_e32 v8, s27 ; SI-NEXT: s_cmp_lt_i32 s25, 0 ; SI-NEXT: v_cndmask_b32_e64 v17, v8, v9, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, s11 ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: v_mov_b32_e32 v8, s11 ; SI-NEXT: s_cmp_gt_i32 s25, 51 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[8:9] -; SI-NEXT: v_mov_b32_e32 v10, s10 ; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v10, s10 ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[10:11] ; SI-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[8:9] @@ -567,9 +567,9 @@ ; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] ; SI-NEXT: s_and_b32 s8, s17, s28 ; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: v_mov_b32_e32 v10, s21 ; SI-NEXT: v_mov_b32_e32 v11, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: v_mov_b32_e32 v10, s21 ; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[8:9] ; SI-NEXT: v_mov_b32_e32 v11, s17 @@ -592,9 +592,9 @@ ; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11] ; SI-NEXT: v_cndmask_b32_e64 v9, 0, v19, s[8:9] ; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_mov_b32_e32 v16, s15 ; SI-NEXT: v_add_f64 v[8:9], v[12:13], v[8:9] ; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v16, s15 ; SI-NEXT: v_cndmask_b32_e64 v13, v15, v16, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc ; SI-NEXT: v_mov_b32_e32 v15, s14 @@ -605,8 +605,8 @@ ; SI-NEXT: v_mov_b32_e32 v18, s13 ; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v16, s12 -; SI-NEXT: v_cndmask_b32_e64 v16, v15, v16, s[6:7] ; SI-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[6:7] +; SI-NEXT: v_cndmask_b32_e64 v16, v15, v16, s[6:7] ; SI-NEXT: v_mov_b32_e32 v15, s13 ; SI-NEXT: v_bfi_b32 v18, s29, v14, v15 ; SI-NEXT: v_add_f64 v[14:15], s[12:13], -v[16:17] @@ -687,9 +687,9 @@ ; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_mov_b32_e32 v17, s9 -; CI-NEXT: v_bfi_b32 v19, s18, v16, v17 ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] ; CI-NEXT: v_add_f64 v[14:15], s[8:9], -v[12:13] +; CI-NEXT: v_bfi_b32 v19, s18, v16, v17 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[10:11] ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 ; CI-NEXT: v_add_f64 v[14:15], s[10:11], -v[16:17] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1075,8 +1075,8 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 @@ -1256,8 +1256,8 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 @@ -1329,8 +1329,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i32 s4, s0, 16 +; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 48 ; GCN-HSA-NEXT: s_sext_i32_i16 s1, s1 ; GCN-HSA-NEXT: s_sext_i32_i16 s0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1436,14 +1436,14 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 ; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 +; GCN-HSA-NEXT: s_and_b32 s7, s7, s8 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1474,12 +1474,12 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s8 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 @@ -1570,10 +1570,10 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s2, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-HSA-NEXT: s_ashr_i32 s8, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s9, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s2, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1584,9 +1584,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 @@ -1724,22 +1724,22 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 +; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 ; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 ; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 ; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 ; GCN-HSA-NEXT: s_and_b32 s9, s9, s12 ; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 +; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 +; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1788,31 +1788,32 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s12 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 @@ -1940,14 +1941,14 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s2, s11, 16 -; GCN-HSA-NEXT: s_ashr_i32 s3, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s12, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s13, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s14, s7, 16 ; GCN-HSA-NEXT: s_ashr_i32 s15, s6, 16 ; GCN-HSA-NEXT: s_ashr_i32 s16, s9, 16 ; GCN-HSA-NEXT: s_ashr_i32 s17, s8, 16 +; GCN-HSA-NEXT: s_ashr_i32 s2, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s3, s10, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1971,18 +1972,18 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 @@ -2014,20 +2015,21 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s14, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s13, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 @@ -2203,9 +2205,6 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s34, s17, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s19, s20 ; GCN-HSA-NEXT: s_and_b32 s21, s5, s20 ; GCN-HSA-NEXT: s_and_b32 s22, s4, s20 ; GCN-HSA-NEXT: s_and_b32 s23, s7, s20 @@ -2218,11 +2217,10 @@ ; GCN-HSA-NEXT: s_and_b32 s30, s12, s20 ; GCN-HSA-NEXT: s_and_b32 s31, s15, s20 ; GCN-HSA-NEXT: s_and_b32 s33, s14, s20 +; GCN-HSA-NEXT: s_and_b32 s34, s17, s20 +; GCN-HSA-NEXT: s_and_b32 s35, s16, s20 +; GCN-HSA-NEXT: s_and_b32 s36, s19, s20 ; GCN-HSA-NEXT: s_and_b32 s20, s18, s20 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 @@ -2235,6 +2233,10 @@ ; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -2319,67 +2321,68 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s18, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s18, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s20 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s16, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s16, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s14, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s14, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s13, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s12, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s11, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 @@ -2591,10 +2594,6 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s33, s17, 16 -; GCN-HSA-NEXT: s_ashr_i32 s34, s16, 16 -; GCN-HSA-NEXT: s_ashr_i32 s35, s19, 16 -; GCN-HSA-NEXT: s_ashr_i32 s36, s18, 16 ; GCN-HSA-NEXT: s_ashr_i32 s20, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s21, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s22, s7, 16 @@ -2607,6 +2606,10 @@ ; GCN-HSA-NEXT: s_ashr_i32 s29, s12, 16 ; GCN-HSA-NEXT: s_ashr_i32 s30, s15, 16 ; GCN-HSA-NEXT: s_ashr_i32 s31, s14, 16 +; GCN-HSA-NEXT: s_ashr_i32 s33, s17, 16 +; GCN-HSA-NEXT: s_ashr_i32 s34, s16, 16 +; GCN-HSA-NEXT: s_ashr_i32 s35, s19, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s18, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -2674,18 +2677,18 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 @@ -2717,56 +2720,57 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s30, s15, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s28, s13, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s28, s13, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s12, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s11, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s11, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s24, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s24, s9, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s25, s8, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s5, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s21, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 @@ -3100,6 +3104,11 @@ ; GCN-HSA-NEXT: s_and_b32 s53, s14, s37 ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_and_b32 s18, s1, s37 +; GCN-HSA-NEXT: s_and_b32 s19, s0, s37 +; GCN-HSA-NEXT: s_and_b32 s54, s3, s37 +; GCN-HSA-NEXT: s_and_b32 s55, s2, s37 +; GCN-HSA-NEXT: s_and_b32 s56, s5, s37 ; GCN-HSA-NEXT: s_and_b32 s57, s4, s37 ; GCN-HSA-NEXT: s_and_b32 s58, s7, s37 ; GCN-HSA-NEXT: s_and_b32 s59, s6, s37 @@ -3110,10 +3119,13 @@ ; GCN-HSA-NEXT: s_and_b32 s64, s13, s37 ; GCN-HSA-NEXT: s_and_b32 s65, s12, s37 ; GCN-HSA-NEXT: s_and_b32 s66, s15, s37 -; GCN-HSA-NEXT: s_and_b32 s54, s3, s37 -; GCN-HSA-NEXT: s_and_b32 s55, s2, s37 -; GCN-HSA-NEXT: s_and_b32 s56, s5, s37 +; GCN-HSA-NEXT: s_and_b32 s37, s14, s37 +; GCN-HSA-NEXT: s_lshr_b32 s67, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 @@ -3123,15 +3135,7 @@ ; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_and_b32 s18, s1, s37 -; GCN-HSA-NEXT: s_and_b32 s19, s0, s37 -; GCN-HSA-NEXT: s_and_b32 s37, s14, s37 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s67, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 @@ -3182,12 +3186,11 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s63 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s62 @@ -3197,8 +3200,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 @@ -3208,9 +3212,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s68 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s53 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s36 @@ -3284,18 +3288,7 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s20 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s37, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s20 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s36, 16 @@ -3328,15 +3321,26 @@ ; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s20 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s50, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s14, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 @@ -3352,16 +3356,17 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s69 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 @@ -3841,7 +3846,12 @@ ; GCN-HSA-NEXT: s_sext_i32_i16 s52, s18 ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-HSA-NEXT: s_ashr_i32 s53, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s54, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s55, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s56, s6, 16 ; GCN-HSA-NEXT: s_ashr_i32 s57, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s58, s8, 16 ; GCN-HSA-NEXT: s_ashr_i32 s59, s11, 16 ; GCN-HSA-NEXT: s_ashr_i32 s60, s10, 16 ; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 @@ -3852,11 +3862,6 @@ ; GCN-HSA-NEXT: s_ashr_i32 s66, s16, 16 ; GCN-HSA-NEXT: s_ashr_i32 s67, s19, 16 ; GCN-HSA-NEXT: s_ashr_i32 s68, s18, 16 -; GCN-HSA-NEXT: s_ashr_i32 s53, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s54, s4, 16 -; GCN-HSA-NEXT: s_ashr_i32 s55, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s56, s6, 16 -; GCN-HSA-NEXT: s_ashr_i32 s58, s8, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 @@ -3913,21 +3918,20 @@ ; GCN-HSA-NEXT: s_sext_i32_i16 s18, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 ; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 ; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 @@ -3938,8 +3942,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 @@ -3949,9 +3954,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s50 @@ -4028,8 +4033,6 @@ ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s12, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s37, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s37 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s39, 16 @@ -4047,13 +4050,15 @@ ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s51, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s49, s51 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s51, s1, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s53, s1 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s53, s1 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s54, s0 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s55, s3, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s57, s3 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s57, s3 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s58, s2 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s12, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 @@ -4064,56 +4069,55 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s11, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s66, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s64, s8, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s62, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s5, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s60, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s19, s36, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s19, s36, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s36 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s38, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s38, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s38 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s40, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s40 @@ -4127,6 +4131,8 @@ ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s46, s48 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s50, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s50, s50 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 @@ -5014,10 +5020,10 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s4, s3 ; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 @@ -5048,10 +5054,10 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 @@ -5158,10 +5164,10 @@ ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s11, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s3, s7, s8 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 ; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 ; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 +; GCN-HSA-NEXT: s_and_b32 s3, s7, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 @@ -5211,12 +5217,13 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -5333,14 +5340,14 @@ ; GCN-HSA-NEXT: s_mov_b32 s8, s5 ; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5400,17 +5407,18 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 @@ -5559,7 +5567,6 @@ ; GCN-HSA-NEXT: s_lshr_b32 s17, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s19, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s3, s9, s12 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 ; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 ; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 @@ -5567,6 +5574,7 @@ ; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 ; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 ; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 +; GCN-HSA-NEXT: s_and_b32 s3, s9, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 @@ -5648,28 +5656,29 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -5859,18 +5868,17 @@ ; GCN-HSA-NEXT: s_mov_b32 s16, s5 ; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s20, s8, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 ; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 @@ -5881,6 +5889,7 @@ ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s22, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s23, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s8 @@ -5888,8 +5897,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s8 @@ -5910,28 +5919,28 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 @@ -5964,13 +5973,13 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s11 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s9 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x100000 @@ -5979,45 +5988,46 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s5 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 @@ -6257,7 +6267,6 @@ ; GCN-HSA-NEXT: s_and_b32 s35, s15, s20 ; GCN-HSA-NEXT: s_and_b32 s36, s17, s20 ; GCN-HSA-NEXT: s_and_b32 s20, s19, s20 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 @@ -6265,6 +6274,7 @@ ; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 @@ -6289,9 +6299,9 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 @@ -6302,9 +6312,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 @@ -6418,60 +6428,61 @@ ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s17, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 @@ -6816,27 +6827,22 @@ ; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-HSA-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[38:39], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 @@ -6851,6 +6857,11 @@ ; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 @@ -6897,10 +6908,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 @@ -6911,19 +6920,21 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s73 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s70 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s53 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 @@ -7024,27 +7035,27 @@ ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[14:15], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[72:73], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[28:29], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[42:43], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[48:49], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[60:61], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[72:73], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 @@ -7053,54 +7064,55 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -525,8 +525,8 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 @@ -1725,8 +1725,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashr_i64 v[7:8], v[3:4], 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v0, v3, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -2177,8 +2177,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 @@ -2189,16 +2189,16 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s4, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s4, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 @@ -2415,22 +2415,22 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v0 -; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] @@ -2438,8 +2438,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(2) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 -; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 @@ -2469,12 +2469,12 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v6 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 @@ -2682,12 +2682,12 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 @@ -2726,41 +2726,41 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v14 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s14, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] @@ -3092,8 +3092,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 @@ -3102,8 +3102,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 @@ -3124,8 +3124,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] @@ -3138,8 +3138,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] @@ -3149,8 +3149,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9 @@ -3165,9 +3165,9 @@ ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 @@ -3200,8 +3200,8 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 @@ -3209,24 +3209,24 @@ ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v0 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v9 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v8, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14 @@ -3617,16 +3617,16 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 -; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 @@ -3658,8 +3658,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v24, s17, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] @@ -3673,8 +3673,8 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3682,27 +3682,27 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v33 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v32 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v35 @@ -3715,12 +3715,12 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28 ; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v29 ; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v28 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 @@ -3733,26 +3733,26 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v21 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v23 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 @@ -3767,15 +3767,15 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v18 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -3785,8 +3785,8 @@ ; ; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 @@ -3878,8 +3878,8 @@ ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s0, v58 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v57 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v57 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v56 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v56 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:240 @@ -4404,8 +4404,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 @@ -4446,8 +4446,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_bfe_i32 v26, v3, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v26, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] @@ -4462,11 +4462,11 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) @@ -4475,8 +4475,8 @@ ; GCN-HSA-NEXT: v_bfe_i32 v2, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 @@ -4484,27 +4484,27 @@ ; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_bfe_i32 v6, v15, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v14 +; GCN-HSA-NEXT: v_bfe_i32 v6, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 -; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 +; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 @@ -4531,13 +4531,13 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v22 ; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 @@ -4550,17 +4550,17 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 ; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 ; GCN-HSA-NEXT: v_bfe_i32 v2, v21, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v20, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(14) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 @@ -4581,8 +4581,8 @@ ; ; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000 @@ -4611,8 +4611,8 @@ ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v9 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v15, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v14 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v15, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v14, 0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -4620,54 +4620,54 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v19 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v19, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v18 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v19, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v18, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v17 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v17, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v17, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v21 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v21, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v20 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v21, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v20, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v25 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v25, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v24 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v25, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v24, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v10 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v8 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v8, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v1, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v5 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v4 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v61 @@ -5765,8 +5765,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 @@ -5792,8 +5792,8 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 @@ -5952,8 +5952,8 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -6080,8 +6080,8 @@ ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6093,11 +6093,11 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 @@ -6134,8 +6134,8 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 @@ -6314,21 +6314,21 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 ; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 @@ -6359,15 +6359,15 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 @@ -6554,8 +6554,8 @@ ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6585,39 +6585,39 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6642,8 +6642,8 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28 @@ -6657,8 +6657,8 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) @@ -6671,14 +6671,15 @@ ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 @@ -6955,8 +6956,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] @@ -6966,42 +6967,42 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 -; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48 -; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 @@ -7060,8 +7061,8 @@ ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16 @@ -7412,8 +7413,8 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: s_mov_b32 s16, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7474,28 +7475,28 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 @@ -7530,19 +7531,19 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 @@ -7590,18 +7591,18 @@ ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v3 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v5 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v33 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, s0, v35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37 @@ -7619,33 +7620,33 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v37 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v37 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 @@ -8114,8 +8115,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 @@ -8124,8 +8125,8 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -8135,8 +8136,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[0:1], 48 +; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 @@ -8151,13 +8152,13 @@ ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 ; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[2:3], 48 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 @@ -8170,16 +8171,16 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[16:19] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] @@ -8192,8 +8193,8 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_bfe_i32 v0, v9, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[8:9], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v9, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] @@ -8206,19 +8207,19 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_bfe_i32 v0, v13, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[12:13], 48 +; GCN-HSA-NEXT: v_bfe_i32 v0, v13, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[14:15], 48 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4 @@ -8236,18 +8237,18 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v15, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 @@ -8255,19 +8256,19 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 @@ -8300,9 +8301,9 @@ ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v15 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v15 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16 @@ -8322,13 +8323,13 @@ ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v11 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v10, 0, 16 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v16, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v12, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v10, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v17, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 @@ -8378,10 +8379,10 @@ ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v11, 0, 16 -; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v11, 0, 16 +; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v2, 0, 16 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -131,9 +131,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 @@ -161,7 +161,6 @@ ; GFX6-NEXT: ds_read_u8 v8, v0 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 14, v0 ; GFX6-NEXT: v_add_i32_e32 v10, vcc, 3, v0 -; GFX6-NEXT: v_add_i32_e32 v11, vcc, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(1) ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -176,6 +175,7 @@ ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 15, v0 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, 2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 ; GFX6-NEXT: ds_read_u8 v4, v4 ; GFX6-NEXT: ds_read_u8 v5, v5 @@ -187,13 +187,13 @@ ; GFX6-NEXT: ds_read_u8 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(7) ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: s_waitcnt lgkmcnt(4) ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: s_waitcnt lgkmcnt(3) -; GFX6-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) +; GFX6-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 @@ -324,8 +324,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -106,12 +106,11 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX7-NEXT: s_waitcnt lgkmcnt(3) ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX7-NEXT: s_waitcnt lgkmcnt(2) -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: s_waitcnt lgkmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -113,8 +113,8 @@ ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; MUBUF-NEXT: v_mov_b32_e32 v4, 0 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0 ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000 ; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33 @@ -224,9 +224,9 @@ ; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d0, v0 +; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c0, v0 ; MUBUF-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c0, v0 ; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c4, v0 ; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -113,10 +113,10 @@ ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -27,9 +27,9 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_lshr_b32 s1, s5, 16 ; VI-NEXT: s_lshr_b32 s6, s0, 16 -; VI-NEXT: s_lshr_b32 s1, s1, s6 ; VI-NEXT: s_and_b32 s5, s5, s4 ; VI-NEXT: s_and_b32 s0, s0, s4 +; VI-NEXT: s_lshr_b32 s1, s1, s6 ; VI-NEXT: s_lshr_b32 s0, s5, s0 ; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_or_b32 s0, s0, s1 @@ -49,9 +49,9 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s2, 16 ; CI-NEXT: s_lshr_b32 s8, s0, 16 -; CI-NEXT: s_lshr_b32 s1, s1, s8 ; CI-NEXT: s_and_b32 s2, s2, s3 ; CI-NEXT: s_and_b32 s0, s0, s3 +; CI-NEXT: s_lshr_b32 s1, s1, s8 ; CI-NEXT: s_lshr_b32 s0, s2, s0 ; CI-NEXT: s_lshl_b32 s1, s1, 16 ; CI-NEXT: s_or_b32 s0, s0, s1 @@ -125,9 +125,9 @@ ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_lshr_b32_e32 v2, v2, v3 ; CI-NEXT: v_lshr_b32_e32 v3, v4, v5 @@ -344,8 +344,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8 @@ -517,13 +517,13 @@ ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 ; CI-NEXT: v_and_b32_e32 v4, s0, v4 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; CI-NEXT: v_and_b32_e32 v5, s0, v5 ; CI-NEXT: v_lshr_b32_e32 v3, v3, v5 ; CI-NEXT: v_lshr_b32_e32 v5, v7, v9 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -122,8 +122,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v7, v[2:3] ; VI-NEXT: flat_load_ushort v8, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -99,8 +99,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 @@ -203,8 +201,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -247,8 +243,8 @@ ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -260,8 +256,8 @@ ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -285,8 +281,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -297,14 +293,12 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -347,8 +341,8 @@ ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -360,8 +354,8 @@ ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -386,8 +380,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -398,14 +392,12 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -193,8 +193,8 @@ ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -206,8 +206,8 @@ ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -267,8 +267,8 @@ ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -280,8 +280,8 @@ ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -85,9 +85,9 @@ ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -125,8 +125,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 @@ -215,9 +213,9 @@ ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -255,8 +253,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -382,8 +378,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -400,9 +394,9 @@ ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -417,9 +411,9 @@ ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 @@ -510,8 +504,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -85,9 +85,9 @@ ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 @@ -183,9 +183,9 @@ ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0 -; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 @@ -327,9 +327,9 @@ ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -81,11 +81,11 @@ ; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 @@ -116,18 +116,18 @@ ; GCN-SCRATCH-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x30 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s0 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v4, s4 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s1 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v2, s2 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v3, s3 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, s8 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v4, s4 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v5, s5 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v6, s6 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v7, s7 -; GCN-SCRATCH-NEXT: v_mov_b32_e32 v12, s12 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, s8 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v9, s9 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v10, s10 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v11, s11 +; GCN-SCRATCH-NEXT: v_mov_b32_e32 v12, s12 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v13, s13 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v14, s14 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v15, s15 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -110,9 +110,9 @@ ; GCN: s_load_dword s ; GCN: s_load_dword s -; SI: s_ashr_i32 ; SI: s_ashr_i32 ; SI: s_sext_i32_i16 +; SI: s_ashr_i32 ; SI: s_sext_i32_i16 ; SI: s_min_i32 ; SI: s_min_i32 diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -9,11 +9,11 @@ ; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} -; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NOT: v_mov_b32 +; GCN-NOT: v_mov_b32 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll --- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll @@ -80,9 +80,9 @@ ; SI: v_mul_u32_u24 ; VI: v_mul_lo_u16_sdwa -; VI: v_mul_lo_u16_e32 ; VI: v_mul_lo_u16_sdwa ; VI: v_mul_lo_u16_e32 +; VI: v_mul_lo_u16_e32 ; VI: v_or_b32_e32 ; VI: v_or_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -82,8 +82,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15 ; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: v_add_u32_e32 v8, v12, v8 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 @@ -151,8 +151,8 @@ ; GFX9-LABEL: slsr1_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v1 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1 ; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -142,10 +142,10 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc @@ -268,10 +268,10 @@ ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -367,9 +367,9 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] ; GFX9-NEXT: global_load_ubyte v3, v1, s[8:9] +; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3 @@ -534,8 +534,8 @@ ; VI-NEXT: s_and_b32 s5, s6, s4 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s7, s4 -; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_mul_i32 s5, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 ; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -708,10 +708,10 @@ ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s2, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s1, s0, s1 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_mul_i32 s3, s3, s1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 ; VI-NEXT: v_and_b32_e32 v1, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -30,8 +30,8 @@ ; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_mov_b32 s32, s6 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 ; MUBUF-NEXT: s_add_i32 s6, s6, s7 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -63,8 +63,8 @@ ; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0 ; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s2 @@ -130,8 +130,8 @@ ; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 ; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 ; MUBUF-NEXT: s_mov_b32 s32, s6 -; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 ; MUBUF-NEXT: s_add_i32 s6, s6, s7 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -161,8 +161,8 @@ ; FLATSCR-NEXT: s_cbranch_scc1 BB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 -; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s2 @@ -356,8 +356,8 @@ ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v5, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v6, 1 -; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 ; FLATSCR-NEXT: s_mov_b32 s32, s2 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -42,10 +42,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s2, s6, s0 -; VI-NEXT: s_addc_u32 s3, s7, s1 ; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] +; VI-NEXT: s_addc_u32 s3, s7, s1 ; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] @@ -66,8 +66,8 @@ ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 -; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] @@ -116,8 +116,8 @@ ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: s_cmp_lt_i32 s12, s8 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_xor_b64 s[0:1], s[10:11], s[8:9] ; SI-NEXT: s_mov_b32 s4, s6 @@ -143,9 +143,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_store_dword v[0:1], v4 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm @@ -306,9 +306,9 @@ ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -319,14 +319,14 @@ ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_add_u32 s0, s4, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] ; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 @@ -345,9 +345,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_addc_u32 s9, s5, s7 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc @@ -508,9 +508,9 @@ ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -537,9 +537,9 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] @@ -558,8 +558,8 @@ ; GFX9-NEXT: v_add_u32_e32 v5, v1, v3 ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp ; GFX9-NEXT: v_add_u32_e32 v4, v0, v2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1 ; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -153,8 +153,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 @@ -200,16 +200,16 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 ; GFX6-NEXT: s_movk_i32 s5, 0x8000 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -221,8 +221,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 @@ -287,9 +287,9 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 @@ -305,8 +305,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 @@ -318,14 +318,14 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc @@ -373,8 +373,8 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -393,8 +393,8 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -467,8 +467,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -34,12 +34,12 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_xor_b32_e32 v2, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -48,8 +48,8 @@ ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v4, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 @@ -79,12 +79,12 @@ ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5 -; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_xor_b32_e32 v2, v5, v2 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 @@ -93,8 +93,8 @@ ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v4, v0 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v1 -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 ; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] +; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 @@ -414,44 +414,44 @@ ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 ; GCN-NEXT: v_xor_b32_e32 v8, v4, v5 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 ; GCN-NEXT: v_xor_b32_e32 v9, v6, v7 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v7, v3 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 ; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v7, s2, v7 +; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v7 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v6, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] ; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc @@ -488,44 +488,44 @@ ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 ; TONGA-NEXT: v_xor_b32_e32 v8, v4, v5 -; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2 ; TONGA-NEXT: v_xor_b32_e32 v9, v6, v7 +; TONGA-NEXT: v_cvt_f32_u32_e32 v5, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v7, v3 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 ; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_mul_f32_e32 v7, s2, v7 +; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 ; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 ; TONGA-NEXT: v_mul_lo_u32 v11, v11, v7 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 +; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 ; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v6, v0 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] ; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc @@ -593,16 +593,16 @@ ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v9 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v8, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v3 ; GFX9-NEXT: v_sub_u32_e32 v9, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v8, 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v9, 1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 @@ -830,20 +830,20 @@ ; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GCN-NEXT: v_mul_f32_e32 v9, s2, v9 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 +; GCN-NEXT: v_mul_f32_e32 v9, s2, v9 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 ; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GCN-NEXT: v_mul_f32_e32 v8, s2, v8 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 +; GCN-NEXT: v_mul_f32_e32 v8, s2, v8 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 ; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 @@ -857,15 +857,15 @@ ; GCN-NEXT: v_mul_lo_u32 v12, v12, v11 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; GCN-NEXT: v_mul_hi_u32 v12, v11, v12 +; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v7 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v11 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 @@ -873,27 +873,27 @@ ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] ; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; GCN-NEXT: v_mul_lo_u32 v0, v9, v5 -; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v4, v10 ; GCN-NEXT: v_mul_lo_u32 v10, v11, v6 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11 ; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v9, vcc, v0, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] ; GCN-NEXT: v_sub_i32_e32 v11, vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] ; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 @@ -902,8 +902,8 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v5, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 +; GCN-NEXT: v_xor_b32_e32 v3, v3, v9 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] ; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v10 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -962,20 +962,20 @@ ; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 +; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 ; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 -; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 +; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 +; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 ; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 @@ -989,15 +989,15 @@ ; TONGA-NEXT: v_mul_lo_u32 v12, v12, v11 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 ; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; TONGA-NEXT: v_mul_hi_u32 v12, v11, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 ; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v7 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v11 ; TONGA-NEXT: v_mul_lo_u32 v12, v8, v4 -; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 ; TONGA-NEXT: v_mul_hi_u32 v11, v2, v11 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 ; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 @@ -1005,27 +1005,27 @@ ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] +; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; TONGA-NEXT: v_mul_lo_u32 v0, v9, v5 -; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10 ; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v10 ; TONGA-NEXT: v_mul_lo_u32 v10, v11, v6 +; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v0, v5 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v2, v6 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] ; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15 ; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1 @@ -1034,8 +1034,8 @@ ; TONGA-NEXT: v_mul_lo_u32 v5, v5, v4 ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9 ; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5 +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9 ; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] ; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v10 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -1085,8 +1085,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9 ; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v11 ; GFX9-NEXT: v_xor_b32_e32 v16, v8, v9 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v11 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v4 ; GFX9-NEXT: v_ashrrev_i32_e32 v12, 31, v2 @@ -1099,21 +1099,21 @@ ; GFX9-NEXT: v_add_u32_e32 v7, v7, v15 ; GFX9-NEXT: v_xor_b32_e32 v17, v10, v11 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GFX9-NEXT: v_xor_b32_e32 v6, v6, v13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v14 ; GFX9-NEXT: v_xor_b32_e32 v18, v12, v13 ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v12, v6 ; GFX9-NEXT: v_xor_b32_e32 v7, v7, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v12, v6 ; GFX9-NEXT: v_xor_b32_e32 v19, v14, v15 ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v14 ; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10 ; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12 @@ -1121,8 +1121,8 @@ ; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 ; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 ; GFX9-NEXT: v_sub_u32_e32 v11, 0, v5 ; GFX9-NEXT: v_sub_u32_e32 v13, 0, v6 ; GFX9-NEXT: v_mul_lo_u32 v11, v11, v10 @@ -1140,45 +1140,45 @@ ; GFX9-NEXT: v_mul_hi_u32 v9, v1, v9 ; GFX9-NEXT: v_add_u32_e32 v11, v14, v15 ; GFX9-NEXT: v_mul_hi_u32 v10, v2, v10 -; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4 ; GFX9-NEXT: v_mul_hi_u32 v11, v3, v11 +; GFX9-NEXT: v_mul_lo_u32 v12, v8, v4 ; GFX9-NEXT: v_mul_lo_u32 v14, v9, v5 ; GFX9-NEXT: v_mul_lo_u32 v15, v10, v6 +; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v12 ; GFX9-NEXT: v_mul_lo_u32 v12, v11, v7 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v14 -; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc ; GFX9-NEXT: v_sub_u32_e32 v13, v0, v4 -; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v14, v1, v5 -; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[2:3], v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[2:3] ; GFX9-NEXT: v_sub_u32_e32 v15, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v14, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v7 ; GFX9-NEXT: v_add_u32_e32 v13, 1, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v14, s[0:1] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] ; GFX9-NEXT: v_sub_u32_e32 v12, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[2:3] ; GFX9-NEXT: v_add_u32_e32 v14, 1, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v15, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5] ; GFX9-NEXT: v_add_u32_e32 v15, 1, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v14, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v15, vcc ; GFX9-NEXT: v_add_u32_e32 v12, 1, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v15, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v16 @@ -1990,13 +1990,13 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_bfe_i32 v5, v0, 0, 25 -; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_bfe_i32 v0, v0, 24, 1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v0, v5 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v0 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_xor_b32_e32 v5, v5, v0 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -2038,13 +2038,13 @@ ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_bfe_i32 v5, v0, 0, 25 -; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_bfe_i32 v0, v0, 24, 1 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, v0, v5 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0 -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v0 +; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -29,10 +29,10 @@ ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -50,8 +50,8 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 ; GCN-NEXT: v_mul_hi_u32 v6, s4, v0 @@ -61,8 +61,8 @@ ; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v8, v3, v5 @@ -77,10 +77,10 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] ; GCN-NEXT: s_add_u32 s0, s10, s14 -; GCN-NEXT: s_addc_u32 s1, s11, s14 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] +; GCN-NEXT: s_addc_u32 s1, s11, s14 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] ; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 ; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 @@ -150,8 +150,8 @@ ; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s10, s6, s0 ; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_sub_u32 s10, s6, s0 ; GCN-IR-NEXT: s_subb_u32 s11, s7, s0 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 @@ -194,14 +194,14 @@ ; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s10, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 ; GCN-IR-NEXT: s_mov_b32 s17, s15 +; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] @@ -255,15 +255,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v4 +; GCN-NEXT: v_xor_b32_e32 v2, v2, v4 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 ; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v15, 0 ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 +; GCN-NEXT: v_mov_b32_e32 v15, 0 ; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 @@ -278,8 +278,8 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_mul_lo_u32 v12, v5, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v13, v5, v10 +; GCN-NEXT: v_mul_hi_u32 v11, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v16, v6, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v9 ; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 @@ -290,8 +290,8 @@ ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v16, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GCN-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v11, vcc +; GCN-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; GCN-NEXT: v_addc_u32_e64 v9, vcc, v6, v10, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v11, v7, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v7, v5 @@ -304,8 +304,8 @@ ; GCN-NEXT: v_mul_hi_u32 v17, v5, v8 ; GCN-NEXT: v_mul_hi_u32 v12, v9, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v9, v7 -; GCN-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GCN-NEXT: v_mul_hi_u32 v11, v9, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, v16, v13 ; GCN-NEXT: v_addc_u32_e32 v16, vcc, v15, v17, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v9, v8 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v13, v7 @@ -352,9 +352,9 @@ ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5] ; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 2, v5 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GCN-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v6, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_add_i32_e64 v12, s[4:5], 1, v5 @@ -369,8 +369,8 @@ ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v10, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc -; GCN-NEXT: v_xor_b32_e32 v2, v7, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GCN-NEXT: v_xor_b32_e32 v2, v7, v4 ; GCN-NEXT: v_xor_b32_e32 v3, v0, v2 ; GCN-NEXT: v_xor_b32_e32 v0, v1, v2 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -383,12 +383,12 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v0, v4 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v0, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[9:10] @@ -421,8 +421,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8] ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v7 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -434,8 +434,8 @@ ; GCN-IR-NEXT: v_not_b32_e32 v0, v0 ; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15 ; GCN-IR-NEXT: v_not_b32_e32 v10, v17 -; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, v0, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, v0, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while @@ -448,15 +448,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v20, v14, vcc ; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7 ; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v11 ; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v11 ; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10] -; GCN-IR-NEXT: v_mov_b32_e32 v9, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v13 ; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v17 ; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v0, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v18 ; GCN-IR-NEXT: v_mov_b32_e32 v18, v12 @@ -475,8 +475,8 @@ ; GCN-IR-NEXT: BB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v11, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v11, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v2, v12, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1008,11 +1008,11 @@ ; GCN-IR-NEXT: s_sext_i32_i16 s7, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 ; GCN-IR-NEXT: s_ashr_i64 s[12:13], s[6:7], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s10, s6, s0 ; GCN-IR-NEXT: s_mov_b32 s3, s2 +; GCN-IR-NEXT: s_sub_u32 s10, s6, s0 ; GCN-IR-NEXT: s_subb_u32 s11, s7, s0 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 @@ -1055,14 +1055,14 @@ ; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s10, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 ; GCN-IR-NEXT: s_mov_b32 s17, s15 +; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] @@ -1120,8 +1120,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s7, 31 ; GCN-NEXT: s_add_u32 s0, s6, s2 -; GCN-NEXT: s_addc_u32 s1, s7, s2 ; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_addc_u32 s1, s7, s2 ; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1135,10 +1135,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s3, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s3, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -1148,16 +1148,16 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s3, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s3, v0 @@ -1167,8 +1167,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -1207,8 +1207,8 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 @@ -1274,8 +1274,8 @@ ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] @@ -1326,15 +1326,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v1 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 @@ -1349,8 +1349,8 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v5, v3 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_mul_lo_u32 v10, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v8 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v14, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -1361,8 +1361,8 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v9, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v9, vcc +; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 ; GCN-NEXT: v_addc_u32_e64 v7, vcc, v4, v8, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v9, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 @@ -1375,8 +1375,8 @@ ; GCN-NEXT: v_mul_hi_u32 v15, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v7, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v7, v5 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 @@ -1411,8 +1411,8 @@ ; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v3 ; GCN-NEXT: v_addc_u32_e64 v8, s[4:5], 0, v13, s[4:5] ; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v3 -; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v13, s[4:5] +; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[4:5] @@ -1436,8 +1436,8 @@ ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 @@ -1463,8 +1463,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1473,9 +1473,9 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while @@ -1488,15 +1488,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 @@ -1530,15 +1530,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v1 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 +; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 @@ -1553,8 +1553,8 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v5, v3 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_mul_lo_u32 v10, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v8 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v14, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 @@ -1565,8 +1565,8 @@ ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v9, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v9, vcc +; GCN-NEXT: v_add_i32_e64 v3, s[4:5], v3, v7 ; GCN-NEXT: v_addc_u32_e64 v7, vcc, v4, v8, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v9, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v5, v3 @@ -1579,8 +1579,8 @@ ; GCN-NEXT: v_mul_hi_u32 v15, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v7, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v7, v5 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 @@ -1616,8 +1616,8 @@ ; GCN-NEXT: v_add_i32_e64 v7, s[4:5], 2, v3 ; GCN-NEXT: v_addc_u32_e64 v8, s[4:5], 0, v13, s[4:5] ; GCN-NEXT: v_add_i32_e64 v9, s[4:5], 1, v3 -; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v13, s[4:5] +; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[4:5] @@ -1641,8 +1641,8 @@ ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 @@ -1654,8 +1654,8 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, s8 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[4:5] @@ -1670,8 +1670,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1679,11 +1679,11 @@ ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while @@ -1696,15 +1696,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 -; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 ; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1 +; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 @@ -1747,8 +1747,8 @@ ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v2, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v1, v2, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v7 ; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 @@ -1772,8 +1772,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[3:4] ; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[7:8], v3 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1796,18 +1796,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v10, vcc ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v7 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8] +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9 ; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8] ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -16,10 +16,10 @@ ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s14, s2 @@ -57,10 +57,10 @@ ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s22, s2 ; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 @@ -108,10 +108,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -143,10 +143,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -188,10 +188,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -223,10 +223,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -268,10 +268,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -303,10 +303,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -349,10 +349,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -384,10 +384,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc @@ -431,10 +431,10 @@ ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s8, s10 -; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s22, s2 ; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s14, s2 @@ -451,9 +451,9 @@ ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -464,8 +464,8 @@ ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v5, v6 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -485,10 +485,10 @@ ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s22, s2 ; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 ; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s14, s2 @@ -502,11 +502,12 @@ ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc @@ -543,10 +544,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -570,8 +571,8 @@ ; SI-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -590,10 +591,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -642,10 +643,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -669,8 +670,8 @@ ; SI-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -689,10 +690,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -741,10 +742,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -755,9 +756,9 @@ ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -788,10 +789,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -804,8 +805,8 @@ ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 @@ -842,10 +843,10 @@ ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s16, s4 ; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -869,8 +870,8 @@ ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -889,10 +890,10 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s4, s6 -; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 @@ -905,8 +906,8 @@ ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll --- a/llvm/test/CodeGen/AMDGPU/select64.ll +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -224,8 +224,8 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_gt_u32 s2, 5 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; SI-NEXT: s_cmp_gt_u32 s2, 5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 @@ -261,9 +261,9 @@ ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_cmp_gt_u32 s6, 5 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: s_cmp_gt_u32 s6, 5 ; GFX90A-NEXT: s_mov_b32 s5, 63 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -14,8 +14,8 @@ ; GCN-NEXT: v_or_b32_e32 v8, v6, v8 ; GCN-NEXT: v_lshl_b64 v[5:6], v[0:1], v5 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] @@ -39,8 +39,8 @@ ; GCN-NEXT: v_or_b32_e32 v8, v6, v8 ; GCN-NEXT: v_lshr_b64 v[5:6], v[2:3], v5 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] @@ -194,10 +194,10 @@ ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], s2 ; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] ; GCN-NEXT: s_cmp_lt_u32 s8, 64 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -230,13 +230,13 @@ ; GCN-NEXT: s_sub_i32 s2, s8, 64 ; GCN-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 ; GCN-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 -; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] ; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s2 +; GCN-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] ; GCN-NEXT: s_cmp_lt_u32 s8, 64 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -278,9 +278,9 @@ ; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_cmp_eq_u32 s8, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -306,13 +306,13 @@ ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 -; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] @@ -322,10 +322,10 @@ ; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5] ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v9 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 @@ -333,9 +333,9 @@ ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 ; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12 -; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc ; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc ; GCN-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[6:7] @@ -355,13 +355,13 @@ ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 -; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] @@ -371,10 +371,10 @@ ; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5] ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9 @@ -382,9 +382,9 @@ ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 ; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 -; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc ; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] @@ -404,13 +404,13 @@ ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 -; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 ; GCN-NEXT: v_or_b32_e32 v18, v18, v16 +; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 ; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] @@ -420,10 +420,10 @@ ; GCN-NEXT: v_cndmask_b32_e64 v11, v17, v19, s[4:5] ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v9 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 ; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9 @@ -457,16 +457,16 @@ ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 ; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] ; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 @@ -477,10 +477,10 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 ; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 @@ -527,16 +527,16 @@ ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 ; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 @@ -547,10 +547,10 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 ; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 @@ -597,16 +597,16 @@ ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 ; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 ; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 @@ -617,10 +617,10 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 ; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 @@ -639,13 +639,13 @@ ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 ; GCN-NEXT: s_ashr_i32 s4, s11, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 ; GCN-NEXT: s_ashr_i32 s4, s15, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 ; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s4 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -551,10 +551,10 @@ ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 ; SI-NEXT: s_mov_b32 s6, 0xffff @@ -578,18 +578,18 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dword s4, s[2:3], 0x0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -658,11 +658,11 @@ ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, s0, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_and_b32_e32 v9, s0, v5 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshl_b32_e32 v5, v7, v5 ; SI-NEXT: v_lshl_b32_e32 v3, v3, v9 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -28,8 +28,8 @@ ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_lshr_b32 s8, s0, 16 ; VI-NEXT: s_and_b32 s2, s2, s3 +; VI-NEXT: s_lshr_b32 s8, s0, 16 ; VI-NEXT: s_and_b32 s0, s0, s3 ; VI-NEXT: s_lshl_b32 s0, s2, s0 ; VI-NEXT: s_lshl_b32 s1, s1, s8 @@ -347,8 +347,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8 @@ -522,11 +522,11 @@ ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v8, s0, v4 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_and_b32_e32 v9, s0, v5 -; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_lshl_b32_e32 v5, v7, v5 ; CI-NEXT: v_lshl_b32_e32 v3, v3, v9 @@ -584,8 +584,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b32 s2, 0xff000000 @@ -596,9 +596,9 @@ ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_and_b32_e32 v4, s2, v4 +; VI-NEXT: v_and_b32_e32 v0, s2, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -282,9 +282,8 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 -; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32 - ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 +; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64 define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 { %idx.add = add nuw i32 %idx, 4 @@ -333,9 +332,8 @@ ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16 - ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16 ; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 @@ -388,10 +386,9 @@ ; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 ; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]] +; GCN: v_lshlrev_b32_e32 [[SCALE2:v[0-9]+]], 4, v0 ; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} - -; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 -; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64 +; GCN: ds_write_b32 [[SCALE2]], v{{[0-9]+}} offset:64 define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 { %idx.add = or i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -836,8 +836,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ushort v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 @@ -923,8 +923,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -994,8 +994,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, -7, v3 @@ -1066,8 +1066,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1137,8 +1137,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 @@ -1413,8 +1413,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1549,8 +1549,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 @@ -1621,8 +1621,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, -16, v3 @@ -1757,8 +1757,8 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 @@ -1823,14 +1823,14 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_movk_i32 s2, 0xc400 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 @@ -1896,14 +1896,14 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_movk_i32 s2, 0x4400 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 @@ -1969,14 +1969,14 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_movk_i32 s2, 0x4000 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 @@ -2042,14 +2042,14 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: s_movk_i32 s2, 0xc000 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 @@ -2117,9 +2117,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, 32 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, 32 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -487,10 +487,10 @@ ; SI-NEXT: s_ashr_i32 s5, s6, 16 ; SI-NEXT: s_sext_i32_i16 s6, s6 ; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_sext_i32_i16 s7, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_sext_i32_i16 s7, s7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s7 @@ -513,10 +513,10 @@ ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_ashr_i32 s4, s7, 16 +; VI-NEXT: s_sext_i32_i16 s7, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: s_sext_i32_i16 s7, s7 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -410,10 +410,10 @@ ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16 @@ -421,26 +421,26 @@ ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 @@ -641,10 +641,10 @@ ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v16 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, -1, v11 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_add_u32_e32 v13, vcc, -1, v13 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, -1, v17 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, 32, v12 ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 32, v14 ; GFX8-NEXT: v_add_u32_e32 v16, vcc, 32, v16 @@ -652,19 +652,19 @@ ; GFX8-NEXT: v_min_u32_e32 v11, v13, v12 ; GFX8-NEXT: v_min_u32_e32 v12, v15, v14 ; GFX8-NEXT: v_min_u32_e32 v13, v17, v16 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v7 ; GFX8-NEXT: v_cvt_f32_i32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_i32_e32 v3, v3 @@ -672,18 +672,18 @@ ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -100,22 +100,39 @@ ; FIXME: Ideally only one early-exit would be emitted define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 { -; WAVE64-LABEL: test_kill_depth_var_x2_same: -; WAVE64: ; %bb.0: -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_mov_b64 s[0:1], exec -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB4_2 -; WAVE64-NEXT: ; %bb.1: -; WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB4_2 -; WAVE64-NEXT: s_endpgm -; WAVE64-NEXT: BB4_2: -; WAVE64-NEXT: s_mov_b64 exec, 0 -; WAVE64-NEXT: exp null off, off, off, off done vm -; WAVE64-NEXT: s_endpgm +; SI-LABEL: test_kill_depth_var_x2_same: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB4_2 +; SI-NEXT: ; %bb.1: +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB4_2 +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX10-WAVE64-LABEL: test_kill_depth_var_x2_same: +; GFX10-WAVE64: ; %bb.0: +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB4_2 +; GFX10-WAVE64-NEXT: ; %bb.1: +; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB4_2 +; GFX10-WAVE64-NEXT: s_endpgm +; GFX10-WAVE64-NEXT: BB4_2: +; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm +; GFX10-WAVE64-NEXT: s_endpgm ; ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2_same: ; GFX10-WAVE32: ; %bb.0: @@ -141,22 +158,39 @@ ; FIXME: Ideally only one early-exit would be emitted define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { -; WAVE64-LABEL: test_kill_depth_var_x2: -; WAVE64: ; %bb.0: -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_mov_b64 s[0:1], exec -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB5_2 -; WAVE64-NEXT: ; %bb.1: -; WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB5_2 -; WAVE64-NEXT: s_endpgm -; WAVE64-NEXT: BB5_2: -; WAVE64-NEXT: s_mov_b64 exec, 0 -; WAVE64-NEXT: exp null off, off, off, off done vm -; WAVE64-NEXT: s_endpgm +; SI-LABEL: test_kill_depth_var_x2: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: ; %bb.1: +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB5_2 +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX10-WAVE64-LABEL: test_kill_depth_var_x2: +; GFX10-WAVE64: ; %bb.0: +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-WAVE64-NEXT: ; %bb.1: +; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB5_2 +; GFX10-WAVE64-NEXT: s_endpgm +; GFX10-WAVE64-NEXT: BB5_2: +; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm +; GFX10-WAVE64-NEXT: s_endpgm ; ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2: ; GFX10-WAVE32: ; %bb.0: @@ -182,25 +216,45 @@ } define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { -; WAVE64-LABEL: test_kill_depth_var_x2_instructions: -; WAVE64: ; %bb.0: -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; WAVE64-NEXT: s_mov_b64 s[0:1], exec -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB6_2 -; WAVE64-NEXT: ; %bb.1: -; WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; WAVE64-NEXT: ;;#ASMSTART -; WAVE64-NEXT: v_mov_b32_e64 v7, -1 -; WAVE64-NEXT: ;;#ASMEND -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 -; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB6_2 -; WAVE64-NEXT: s_endpgm -; WAVE64-NEXT: BB6_2: -; WAVE64-NEXT: s_mov_b64 exec, 0 -; WAVE64-NEXT: exp null off, off, off, off done vm -; WAVE64-NEXT: s_endpgm +; SI-LABEL: test_kill_depth_var_x2_instructions: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB6_2 +; SI-NEXT: ; %bb.1: +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: v_mov_b32_e64 v7, -1 +; SI-NEXT: ;;#ASMEND +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; SI-NEXT: s_cbranch_scc0 BB6_2 +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX10-WAVE64-LABEL: test_kill_depth_var_x2_instructions: +; GFX10-WAVE64: ; %bb.0: +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB6_2 +; GFX10-WAVE64-NEXT: ; %bb.1: +; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: ;;#ASMSTART +; GFX10-WAVE64-NEXT: v_mov_b32_e64 v7, -1 +; GFX10-WAVE64-NEXT: ;;#ASMEND +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB6_2 +; GFX10-WAVE64-NEXT: s_endpgm +; GFX10-WAVE64-NEXT: BB6_2: +; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm +; GFX10-WAVE64-NEXT: s_endpgm ; ; GFX10-WAVE32-LABEL: test_kill_depth_var_x2_instructions: ; GFX10-WAVE32: ; %bb.0: @@ -231,40 +285,75 @@ ; FIXME: why does the skip depend on the asm length in the same block? define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 { -; WAVE64-LABEL: test_kill_control_flow: -; WAVE64: ; %bb.0: ; %entry -; WAVE64-NEXT: s_cmp_lg_u32 s0, 0 -; WAVE64-NEXT: s_cbranch_scc0 BB7_2 -; WAVE64-NEXT: ; %bb.1: ; %exit -; WAVE64-NEXT: v_mov_b32_e32 v0, 1.0 -; WAVE64-NEXT: s_branch BB7_5 -; WAVE64-NEXT: BB7_2: ; %bb -; WAVE64-NEXT: ;;#ASMSTART -; WAVE64-NEXT: v_mov_b32_e64 v7, -1 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: v_nop_e64 -; WAVE64-NEXT: ;;#ASMEND -; WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 -; WAVE64-NEXT: s_mov_b64 s[2:3], exec -; WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc -; WAVE64-NEXT: s_cbranch_scc0 BB7_4 -; WAVE64-NEXT: ; %bb.3: ; %bb -; WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; WAVE64-NEXT: v_mov_b32_e32 v0, 1.0 -; WAVE64-NEXT: s_branch BB7_5 -; WAVE64-NEXT: BB7_4: -; WAVE64-NEXT: s_mov_b64 exec, 0 -; WAVE64-NEXT: exp null off, off, off, off done vm -; WAVE64-NEXT: s_endpgm -; WAVE64-NEXT: BB7_5: +; SI-LABEL: test_kill_control_flow: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cbranch_scc0 BB7_2 +; SI-NEXT: ; %bb.1: ; %exit +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_2: ; %bb +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: v_mov_b32_e64 v7, -1 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: v_nop_e64 +; SI-NEXT: ;;#ASMEND +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_cbranch_scc0 BB7_4 +; SI-NEXT: ; %bb.3: ; %bb +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_branch BB7_5 +; SI-NEXT: BB7_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB7_5: +; +; GFX10-WAVE64-LABEL: test_kill_control_flow: +; GFX10-WAVE64: ; %bb.0: ; %entry +; GFX10-WAVE64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB7_2 +; GFX10-WAVE64-NEXT: ; %bb.1: ; %exit +; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-WAVE64-NEXT: s_branch BB7_5 +; GFX10-WAVE64-NEXT: BB7_2: ; %bb +; GFX10-WAVE64-NEXT: ;;#ASMSTART +; GFX10-WAVE64-NEXT: v_mov_b32_e64 v7, -1 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: v_nop_e64 +; GFX10-WAVE64-NEXT: ;;#ASMEND +; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB7_4 +; GFX10-WAVE64-NEXT: ; %bb.3: ; %bb +; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-WAVE64-NEXT: s_branch BB7_5 +; GFX10-WAVE64-NEXT: BB7_4: +; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 +; GFX10-WAVE64-NEXT: exp null off, off, off, off done vm +; GFX10-WAVE64-NEXT: s_endpgm +; GFX10-WAVE64-NEXT: BB7_5: ; ; GFX10-WAVE32-LABEL: test_kill_control_flow: ; GFX10-WAVE32: ; %bb.0: ; %entry @@ -332,6 +421,7 @@ ; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: s_cbranch_scc1 BB8_3 ; SI-NEXT: ; %bb.1: ; %bb +; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: ;;#ASMSTART ; SI-NEXT: v_mov_b32_e64 v7, -1 ; SI-NEXT: v_nop_e64 @@ -347,7 +437,6 @@ ; SI-NEXT: v_nop_e64 ; SI-NEXT: ;;#ASMEND ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 -; SI-NEXT: s_mov_b64 s[2:3], exec ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; SI-NEXT: ;;#ASMSTART ; SI-NEXT: v_mov_b32_e64 v8, -1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -49,15 +49,15 @@ ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 ; GFX6-NEXT: s_add_i32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: NumSgprs: 48 ; GFX6: ScratchSize: 8608 ; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill +; GFX9-FLATSCR: s_waitcnt vmcnt(0) +; FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill ; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x ; FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -162,8 +162,8 @@ ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 -; SI-NEXT: v_and_b32_e32 v3, s6, v1 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; SI-NEXT: v_and_b32_e32 v3, s6, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; SI-NEXT: v_ashrrev_i32_e32 v1, v3, v2 @@ -250,16 +250,16 @@ ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_i32 v4, v0, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; SI-NEXT: v_bfe_i32 v5, v1, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v6, s6, v2 -; SI-NEXT: v_and_b32_e32 v7, s6, v3 -; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, s6, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_ashrrev_i32_e32 v1, v3, v1 -; SI-NEXT: v_ashrrev_i32_e32 v0, v2, v0 ; SI-NEXT: v_ashrrev_i32_e32 v3, v7, v5 +; SI-NEXT: v_ashrrev_i32_e32 v0, v2, v0 ; SI-NEXT: v_ashrrev_i32_e32 v2, v6, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v3, s6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -24,10 +24,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -45,8 +45,8 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -55,8 +55,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -99,12 +99,12 @@ ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -173,8 +173,8 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] @@ -239,9 +239,9 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 @@ -256,8 +256,8 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v6, v4 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_mul_lo_u32 v11, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v9 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v15, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -268,8 +268,8 @@ ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc +; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 @@ -282,8 +282,8 @@ ; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 @@ -326,13 +326,13 @@ ; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 ; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v2 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -340,12 +340,12 @@ ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v6 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 @@ -361,10 +361,10 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v3, v6, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] @@ -396,8 +396,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8] ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v7 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -407,10 +407,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v6, vcc ; GCN-IR-NEXT: v_not_b32_e32 v3, v3 +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 ; GCN-IR-NEXT: v_not_b32_e32 v9, v11 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 +; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while @@ -423,15 +423,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc ; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 ; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 ; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 ; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 @@ -900,10 +900,10 @@ ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -921,8 +921,8 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 ; GCN-NEXT: v_mul_hi_u32 v6, s4, v0 @@ -932,8 +932,8 @@ ; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v8, v3, v5 @@ -950,8 +950,8 @@ ; GCN-NEXT: s_add_u32 s0, s2, s10 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GCN-NEXT: s_addc_u32 s1, s3, s10 -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] ; GCN-NEXT: v_mul_lo_u32 v3, s14, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v0 ; GCN-NEXT: v_mul_hi_u32 v5, s14, v2 @@ -980,12 +980,12 @@ ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -1071,8 +1071,8 @@ ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s6, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] @@ -1111,8 +1111,8 @@ ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -1180,8 +1180,8 @@ ; GCN-IR-NEXT: s_sext_i32_i16 s7, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_ashr_i32 s12, s7, 31 -; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[6:7], 24 +; GCN-IR-NEXT: s_mov_b32 s1, s0 ; GCN-IR-NEXT: s_mov_b32 s13, s12 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[0:1] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[12:13] @@ -1233,8 +1233,8 @@ ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] @@ -1273,8 +1273,8 @@ ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 @@ -1300,8 +1300,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-NEXT: s_add_u32 s2, s6, s0 -; GCN-NEXT: s_addc_u32 s3, s7, s0 ; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_addc_u32 s3, s7, s0 ; GCN-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1316,10 +1316,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -1329,16 +1329,16 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -1347,8 +1347,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -1380,15 +1380,15 @@ ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] @@ -1448,8 +1448,8 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -1509,9 +1509,9 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 @@ -1526,8 +1526,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_mul_lo_u32 v9, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v7 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v13, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1538,8 +1538,8 @@ ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v7, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc +; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 @@ -1552,8 +1552,8 @@ ; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 @@ -1585,9 +1585,9 @@ ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] -; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 @@ -1634,8 +1634,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1644,9 +1644,9 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while @@ -1659,15 +1659,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 @@ -1711,9 +1711,9 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 @@ -1728,8 +1728,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_mul_lo_u32 v9, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v7 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v13, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -1740,8 +1740,8 @@ ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v7, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc +; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 @@ -1754,8 +1754,8 @@ ; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 @@ -1788,9 +1788,9 @@ ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] -; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 @@ -1824,8 +1824,8 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] @@ -1839,8 +1839,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1848,11 +1848,11 @@ ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while @@ -1865,15 +1865,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 @@ -1922,8 +1922,8 @@ ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v0 ; GCN-IR-NEXT: v_add_i32_e64 v3, s[4:5], 32, v3 @@ -1947,17 +1947,17 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB13_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1971,18 +1971,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, 0x8000, v12 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v15, 0x8000, v12 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -428,8 +428,8 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: s_mov_b64 s[0:1], s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -153,8 +153,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 @@ -200,17 +200,17 @@ ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 ; GFX6-NEXT: s_movk_i32 s5, 0x8000 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -222,8 +222,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 @@ -288,9 +288,9 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 @@ -306,8 +306,8 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 ; GFX8-NEXT: s_movk_i32 s6, 0x8000 ; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 @@ -319,14 +319,14 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 +; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc @@ -374,8 +374,8 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -394,8 +394,8 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -433,15 +433,15 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 ; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -460,15 +460,15 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -508,22 +508,22 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 ; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 ; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -542,22 +542,22 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 ; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -599,51 +599,51 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v4, v16, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v5, v16, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, v16, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 ; GFX6-NEXT: v_xor_b32_e32 v7, v16, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -662,51 +662,51 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, v16, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v4, v16, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v5, v16, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v6, v16, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 ; GFX8-NEXT: v_xor_b32_e32 v7, v16, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -763,100 +763,100 @@ ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v17, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v3, v17, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v4, v17, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v5, v17, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v6, v17, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v7, v17, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v8, v17, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v9, v17, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v10, v17, v10 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v11, v17, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v12, v17, v12 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v13, v17, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v14, v17, v14 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 ; GFX6-NEXT: v_xor_b32_e32 v15, v17, v15 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -882,100 +882,100 @@ ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v17, 1 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v3, v17, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v4, v17, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v5, v17, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v6, v17, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v7, v17, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v8, v17, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v9, v17, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v10, v17, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v11, v17, v11 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v12, v17, v12 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v13, v17, v13 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v14, v17, v14 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 +; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 ; GFX8-NEXT: v_xor_b32_e32 v15, v17, v15 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] @@ -1077,8 +1077,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -193,12 +193,12 @@ ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 ; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 +; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: s_add_i32 s32, s32, 0x30000 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -223,9 +223,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: s_lshr_b32 s3, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: s_lshr_b32 s2, s6, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: s_lshr_b32 s6, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: s_lshr_b32 s1, s7, 24 @@ -234,8 +234,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: v_mov_b32_e32 v9, s6 ; GFX10-NEXT: s_lshr_b32 s0, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s1 ; GFX10-NEXT: v_mov_b32_e32 v7, s2 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 @@ -248,8 +248,8 @@ ; GFX10-NEXT: ds_write_b8 v0, v5 offset:13 ; GFX10-NEXT: ds_write_b8 v0, v6 offset:15 ; GFX10-NEXT: ds_write_b8 v0, v7 offset:9 -; GFX10-NEXT: s_lshr_b32 s1, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_lshr_b32 s1, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 @@ -374,8 +374,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: ds_write2_b32 v0, v3, v1 offset0:2 offset1:3 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -38,8 +38,8 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm @@ -189,12 +189,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s0, s6, 8 ; GFX10-NEXT: s_lshr_b32 s1, s6, 24 ; GFX10-NEXT: s_lshr_b32 s2, s5, 8 ; GFX10-NEXT: s_lshr_b32 s3, s5, 24 ; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 ; GFX10-NEXT: s_lshr_b32 s4, s4, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 @@ -387,8 +387,8 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_write_b64 v2, v[0:1] ; GFX7-NEXT: s_endpgm @@ -401,8 +401,8 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm @@ -458,8 +458,8 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -92,8 +92,8 @@ ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 @@ -244,8 +244,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX10-NEXT: ds_write_b64 v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -164,8 +164,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 @@ -175,8 +175,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 @@ -187,8 +187,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -31,9 +31,9 @@ ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_fma_f16 v3, v5, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 @@ -61,9 +61,9 @@ ; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX8-NEXT: v_fma_f16 v6, v8, v7, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 @@ -87,19 +87,19 @@ ; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX9-NEXT: v_fma_f16 v6, v8, v7, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -107,18 +107,18 @@ ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v1 ; GFX8-NEXT: v_fma_f16 v6, v8, v7, v6 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX8-NEXT: v_fma_f16 v7, v9, v8, v7 ; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -128,9 +128,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 @@ -215,9 +215,9 @@ ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_fma_f16 v3, -v5, -v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_fma_f16 v0, -v0, -v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -164,8 +164,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 @@ -175,8 +175,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_mul_f16_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 @@ -187,8 +187,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -184,8 +184,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 @@ -195,8 +195,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 @@ -207,8 +207,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 @@ -239,9 +239,9 @@ ; GFX9-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s0, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_sub_f16_e32 v0, s1, v0 ; GFX9-NEXT: v_sub_f16_e32 v1, s2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -41,7 +41,6 @@ ; GCN: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 ; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] -; GCN: s_addc_u32 ; SI: buffer_store_dword v[[LO_VREG]], ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]] ; GCN: v_mov_b32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -88,9 +88,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 ; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 @@ -123,17 +123,17 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_min_u32_e32 v3, s4, v2 ; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_min_u32_e32 v3, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -165,9 +165,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 ; GFX6-NEXT: v_and_b32_e32 v7, s4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -22,10 +22,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -43,8 +43,8 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s4, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 @@ -54,8 +54,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -134,8 +134,8 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 ; GCN-IR-NEXT: s_min_u32 s10, s12, s8 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 @@ -168,14 +168,14 @@ ; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s6, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s7, s3, s11 ; GCN-IR-NEXT: s_mov_b32 s13, s11 +; GCN-IR-NEXT: s_addc_u32 s7, s3, s11 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -224,25 +224,25 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v8, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v4 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v4 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_mul_lo_u32 v11, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v9 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v15, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -253,8 +253,8 @@ ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc +; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 @@ -267,8 +267,8 @@ ; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 @@ -310,9 +310,9 @@ ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GCN-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5] ; GCN-NEXT: v_add_i32_e64 v8, s[4:5], 2, v4 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GCN-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v5, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_add_i32_e64 v10, s[4:5], 1, v4 @@ -362,8 +362,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -375,8 +375,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v8 ; GCN-IR-NEXT: v_not_b32_e32 v1, v9 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while @@ -389,15 +389,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v12, v8, v2 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v8 ; GCN-IR-NEXT: v_and_b32_e32 v13, v8, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v8, v2 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 @@ -729,8 +729,8 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -741,8 +741,8 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v1, s[0:1], v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc +; GCN-NEXT: v_add_i32_e64 v1, s[0:1], v1, v3 ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v5, s2, v3 ; GCN-NEXT: v_mul_hi_u32 v6, s2, v1 @@ -751,8 +751,8 @@ ; GCN-NEXT: v_mul_lo_u32 v6, s2, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GCN-NEXT: v_mul_lo_u32 v11, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v12, v1, v6 +; GCN-NEXT: v_mul_hi_u32 v13, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 @@ -768,8 +768,8 @@ ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_mov_b32_e32 v3, s8 -; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 ; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 @@ -794,14 +794,14 @@ ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 ; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[0:1] @@ -865,14 +865,14 @@ ; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 ; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] ; GCN-IR-NEXT: s_add_u32 s6, s0, s12 -; GCN-IR-NEXT: s_addc_u32 s7, s1, s11 ; GCN-IR-NEXT: s_mov_b32 s13, s11 +; GCN-IR-NEXT: s_addc_u32 s7, s1, s11 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s1, 0 ; GCN-IR-NEXT: BB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s0, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s0, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -937,10 +937,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -950,16 +950,16 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -968,8 +968,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -1008,8 +1008,8 @@ ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 ; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 @@ -1065,8 +1065,8 @@ ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -1118,18 +1118,18 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1147,8 +1147,8 @@ ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc +; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 @@ -1161,8 +1161,8 @@ ; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 @@ -1193,8 +1193,8 @@ ; GCN-NEXT: v_add_i32_e64 v6, s[4:5], 2, v2 ; GCN-NEXT: v_addc_u32_e64 v7, s[4:5], 0, v12, s[4:5] ; GCN-NEXT: v_add_i32_e64 v8, s[4:5], 1, v2 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_addc_u32_e64 v9, s[4:5], 0, v12, s[4:5] +; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e64 v5, v8, v6, s[4:5] @@ -1221,8 +1221,8 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] @@ -1236,8 +1236,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v6 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1245,11 +1245,11 @@ ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while @@ -1262,15 +1262,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 @@ -1327,8 +1327,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1351,18 +1351,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v11, 0x8000, v9 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v9 +; GCN-IR-NEXT: v_and_b32_e32 v11, 0x8000, v9 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 @@ -1416,25 +1416,25 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] +; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 ; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 +; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 @@ -1471,8 +1471,8 @@ ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v0 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 ; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 ; GCN-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 24, v8 ; GCN-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc @@ -1530,8 +1530,8 @@ ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] @@ -1605,10 +1605,10 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc -; GCN-NEXT: v_mul_hi_u32 v6, v2, s6 +; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[4:5] +; GCN-NEXT: v_mul_hi_u32 v6, v2, s6 ; GCN-NEXT: v_mul_lo_u32 v7, v4, s6 ; GCN-NEXT: v_mul_lo_u32 v8, v2, s6 ; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v6 @@ -1697,8 +1697,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1720,14 +1720,14 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 ; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 ; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -100,8 +100,8 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -165,8 +165,8 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -209,8 +209,8 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -219,14 +219,14 @@ ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -345,8 +345,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -361,8 +361,8 @@ ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 @@ -370,19 +370,19 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 @@ -423,8 +423,8 @@ ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, s12, v3 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX8-NEXT: v_mul_f32_e32 v2, s12, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 @@ -438,8 +438,8 @@ ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX8-NEXT: s_sub_i32 s2, 0, s11 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 @@ -447,19 +447,19 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX8-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 ; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v2 -; GFX8-NEXT: v_mul_lo_u32 v3, v3, s11 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -241,14 +241,14 @@ ; GFX8-NEXT: s_flbit_i32_b32 s6, s3 ; GFX8-NEXT: s_flbit_i32_b32 s7, s1 ; GFX8-NEXT: s_min_u32 s6, s6, 32 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GFX8-NEXT: s_min_u32 s7, s7, 32 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 ; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_min_u32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: s_min_u32 s0, s0, 1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX8-NEXT: s_sub_i32 s0, 32, s6 ; GFX8-NEXT: v_ldexp_f32 v1, v0, s0 @@ -341,26 +341,26 @@ ; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 ; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_ldexp_f32 v0, v4, v11 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 @@ -418,8 +418,8 @@ ; GFX8-NEXT: s_min_u32 s9, s3, 32 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8 ; GFX8-NEXT: s_min_u32 s2, s2, 1 -; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_min_u32 s2, s4, 1 ; GFX8-NEXT: s_or_b32 s2, s5, s2 @@ -524,19 +524,19 @@ ; GFX8-NEXT: v_min_u32_e32 v11, 32, v11 ; GFX8-NEXT: v_min_u32_e32 v12, 32, v12 ; GFX8-NEXT: v_min_u32_e32 v13, 32, v13 -; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] -; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[7:8], v0, v[7:8] ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, 32, v0 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v11, v[5:6] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v12, v[3:4] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v13, v[1:2] ; GFX8-NEXT: v_min_u32_e32 v7, 1, v7 ; GFX8-NEXT: v_min_u32_e32 v5, 1, v5 ; GFX8-NEXT: v_min_u32_e32 v3, 1, v3 ; GFX8-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v7 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, v3 @@ -544,18 +544,18 @@ ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, 32, v11 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, 32, v12 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v13 +; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 ; GFX8-NEXT: v_ldexp_f32 v4, v4, v11 ; GFX8-NEXT: v_ldexp_f32 v3, v3, v12 ; GFX8-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX8-NEXT: v_ldexp_f32 v1, v1, v14 -; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc -; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -24,10 +24,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -45,8 +45,8 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -55,8 +55,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -99,12 +99,12 @@ ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 @@ -173,8 +173,8 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] @@ -234,25 +234,25 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_mul_hi_u32 v8, v6, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v5 +; GCN-NEXT: v_mul_hi_u32 v8, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v10, v7, v4 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v4 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_mul_lo_u32 v11, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v9 +; GCN-NEXT: v_mul_hi_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v15, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 @@ -263,8 +263,8 @@ ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v10, vcc +; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v10, v6, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 @@ -277,8 +277,8 @@ ; GCN-NEXT: v_mul_hi_u32 v16, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v11, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v8, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 @@ -316,13 +316,13 @@ ; GCN-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2 ; GCN-NEXT: v_subbrev_u32_e64 v7, s[6:7], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v9, s[4:5], v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc @@ -330,10 +330,10 @@ ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -371,8 +371,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -382,9 +382,9 @@ ; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 ; GCN-IR-NEXT: v_not_b32_e32 v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc @@ -398,15 +398,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 ; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v14 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 ; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 ; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v14 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 @@ -758,10 +758,10 @@ ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v7, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 @@ -771,16 +771,16 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, s2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 @@ -789,8 +789,8 @@ ; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 @@ -822,15 +822,15 @@ ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s6, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s7, v5 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v4 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s6, v4 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s7, v5 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s6, v4 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] @@ -885,8 +885,8 @@ ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB6_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -967,25 +967,25 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] +; GCN-NEXT: v_mul_hi_u32 v4, v0, s2 ; GCN-NEXT: v_mul_lo_u32 v5, v2, s2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 @@ -1014,12 +1014,12 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc @@ -1079,8 +1079,8 @@ ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] @@ -1111,8 +1111,8 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[10:11] ; GCN-IR-NEXT: BB7_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, 24 +; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 @@ -1143,18 +1143,18 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v0 ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v2 ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1172,8 +1172,8 @@ ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v8, vcc +; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v6 ; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v4, v2 @@ -1186,8 +1186,8 @@ ; GCN-NEXT: v_mul_hi_u32 v14, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v6, v4 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 @@ -1215,9 +1215,9 @@ ; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] -; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 @@ -1245,8 +1245,8 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] @@ -1260,8 +1260,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1269,11 +1269,11 @@ ; GCN-IR-NEXT: s_cbranch_execz BB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while @@ -1286,15 +1286,15 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 ; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 @@ -1357,17 +1357,17 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[2:3] ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1381,18 +1381,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v10 ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v13, 0x8000, v10 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -110,9 +110,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v4, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 @@ -152,9 +152,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v6, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 @@ -201,9 +201,9 @@ ; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v9, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_and_b32_e32 v8, s4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -16,8 +16,8 @@ ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 ; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] +; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 @@ -322,11 +322,11 @@ ; GCN-ALLOCA-COUNT-4: buffer_store_dword ; GCN-ALLOCA: buffer_load_dword -; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 ; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 +; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] +; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -36,12 +36,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] -; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_load_dword s14, s[0:1], 0xf ; GCN-NEXT: s_cmp_eq_u32 s13, 3 @@ -60,10 +60,10 @@ ; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5] +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_eq_u32 s14, 2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 @@ -113,12 +113,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s8, s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v5, v2 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -10,8 +10,8 @@ ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 ; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8 -; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] ; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] @@ -26,8 +26,8 @@ ; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 -; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen +; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GCN-NEXT: s_cbranch_execnz BB0_2 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -166,8 +166,8 @@ ; SI-NEXT: s_mov_b32 s39, 0x31c16000 ; SI-NEXT: s_add_u32 s36, s36, s1 ; SI-NEXT: s_addc_u32 s37, s37, 0 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s33, exec_lo, s0 ; SI-NEXT: s_cbranch_execz BB3_4 @@ -239,13 +239,13 @@ ; SI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s38, -1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_mov_b32 s39, 0x31c16000 ; SI-NEXT: s_add_u32 s36, s36, s1 ; SI-NEXT: s_addc_u32 s37, s37, 0 -; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s33, exec_lo, s0 ; SI-NEXT: s_cbranch_execz BB4_4 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -148,8 +148,8 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v2, 2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 2 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -518,8 +518,8 @@ ; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v0 ; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v1, vcc ; GFX9-O3-NEXT: v_mul_lo_u32 v4, v3, v0 -; GFX9-O3-NEXT: v_mul_hi_u32 v5, v2, v0 ; GFX9-O3-NEXT: v_mul_lo_u32 v1, v2, v1 +; GFX9-O3-NEXT: v_mul_hi_u32 v5, v2, v0 ; GFX9-O3-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX9-O3-NEXT: v_add3_u32 v1, v5, v1, v4 ; GFX9-O3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 @@ -823,9 +823,9 @@ ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 ; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -138,9 +138,9 @@ define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) { ; GFX9-LABEL: xor3_uniform_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX9-NEXT: v_add_f32_e64 v1, s3, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x40400000 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2